
locking/atomic/x86: Introduce try_cmpxchg64

Message ID 20220510154217.5216-1-ubizjak@gmail.com (mailing list archive)
State New, archived
Series locking/atomic/x86: Introduce try_cmpxchg64

Commit Message

Uros Bizjak May 10, 2022, 3:42 p.m. UTC
This patch adds try_cmpxchg64 to improve code around cmpxchg8b.  While
the resulting code improvements on x86_64 are minor (a compare and a move saved),
the improvements on x86_32 are quite noticeable. The code improves from:

  84:    89 74 24 30              mov    %esi,0x30(%esp)
  88:    89 fe                    mov    %edi,%esi
  8a:    0f b7 0c 02              movzwl (%edx,%eax,1),%ecx
  8e:    c1 e1 08                 shl    $0x8,%ecx
  91:    0f b7 c9                 movzwl %cx,%ecx
  94:    89 4c 24 34              mov    %ecx,0x34(%esp)
  98:    8b 96 24 1e 00 00        mov    0x1e24(%esi),%edx
  9e:    8b 86 20 1e 00 00        mov    0x1e20(%esi),%eax
  a4:    8b 5c 24 34              mov    0x34(%esp),%ebx
  a8:    8b 7c 24 30              mov    0x30(%esp),%edi
  ac:    89 44 24 38              mov    %eax,0x38(%esp)
  b0:    0f b6 44 24 38           movzbl 0x38(%esp),%eax
  b5:    8b 4c 24 38              mov    0x38(%esp),%ecx
  b9:    89 54 24 3c              mov    %edx,0x3c(%esp)
  bd:    83 e0 fd                 and    $0xfffffffd,%eax
  c0:    89 5c 24 64              mov    %ebx,0x64(%esp)
  c4:    8b 54 24 3c              mov    0x3c(%esp),%edx
  c8:    89 4c 24 60              mov    %ecx,0x60(%esp)
  cc:    8b 4c 24 34              mov    0x34(%esp),%ecx
  d0:    88 44 24 60              mov    %al,0x60(%esp)
  d4:    8b 44 24 38              mov    0x38(%esp),%eax
  d8:    c6 44 24 62 f2           movb   $0xf2,0x62(%esp)
  dd:    8b 5c 24 60              mov    0x60(%esp),%ebx
  e1:    f0 0f c7 0f              lock cmpxchg8b (%edi)
  e5:    89 d1                    mov    %edx,%ecx
  e7:    8b 54 24 3c              mov    0x3c(%esp),%edx
  eb:    33 44 24 38              xor    0x38(%esp),%eax
  ef:    31 ca                    xor    %ecx,%edx
  f1:    09 c2                    or     %eax,%edx
  f3:    75 a3                    jne    98 <t+0x98>

to:

  84:    0f b7 0c 02              movzwl (%edx,%eax,1),%ecx
  88:    c1 e1 08                 shl    $0x8,%ecx
  8b:    0f b7 c9                 movzwl %cx,%ecx
  8e:    8b 86 20 1e 00 00        mov    0x1e20(%esi),%eax
  94:    8b 96 24 1e 00 00        mov    0x1e24(%esi),%edx
  9a:    89 4c 24 64              mov    %ecx,0x64(%esp)
  9e:    89 c3                    mov    %eax,%ebx
  a0:    89 44 24 60              mov    %eax,0x60(%esp)
  a4:    83 e3 fd                 and    $0xfffffffd,%ebx
  a7:    c6 44 24 62 f2           movb   $0xf2,0x62(%esp)
  ac:    88 5c 24 60              mov    %bl,0x60(%esp)
  b0:    8b 5c 24 60              mov    0x60(%esp),%ebx
  b4:    f0 0f c7 0f              lock cmpxchg8b (%edi)
  b8:    75 d4                    jne    8e <t+0x8e>

The implementation extends the existing cmpxchg64 implementation.
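
For reference, the caller-side difference is roughly the following (a generic
sketch with made-up helper names, not a call site from this series):

#include <linux/atomic.h>
#include <linux/types.h>

/* Retry loop with plain cmpxchg64(): the returned value has to be compared
 * against the expected one and copied back by hand. */
static void set_bits_cmpxchg64(u64 *ptr, u64 bits)
{
	u64 old = READ_ONCE(*ptr);

	for (;;) {
		u64 tmp = cmpxchg64(ptr, old, old | bits);

		if (tmp == old)
			break;
		old = tmp;
	}
}

/* Same loop with try_cmpxchg64(): the helper returns the ZF produced by
 * cmpxchg8b/cmpxchgq and refreshes 'old' on failure, so the explicit
 * compare and copy disappear. */
static void set_bits_try_cmpxchg64(u64 *ptr, u64 bits)
{
	u64 old = READ_ONCE(*ptr);

	do {
	} while (!try_cmpxchg64(ptr, &old, old | bits));
}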

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Marco Elver <elver@google.com>
---
 arch/x86/include/asm/cmpxchg_32.h          | 43 ++++++++++++++++++++++
 arch/x86/include/asm/cmpxchg_64.h          |  6 +++
 include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++++++-
 scripts/atomic/gen-atomic-instrumented.sh  |  2 +-
 4 files changed, 89 insertions(+), 2 deletions(-)

Comments

Peter Zijlstra May 10, 2022, 4:55 p.m. UTC | #1
On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:
> This patch adds try_cmpxchg64 to improve code around cmpxchg8b.  While
> the resulting code improvements on x86_64 are minor (a compare and a move saved),
> the improvements on x86_32 are quite noticeable. The code improves from:

What user of cmpxchg64 is this?
Uros Bizjak May 10, 2022, 5:07 p.m. UTC | #2
On Tue, May 10, 2022 at 6:55 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:
> > This patch adds try_cmpxchg64 to improve code around cmpxchg8b.  While
> > the resulting code improvements on x86_64 are minor (a compare and a move saved),
> > the improvements on x86_32 are quite noticeable. The code improves from:
>
> What user of cmpxchg64 is this?

This is cmpxchg64 in pi_try_set_control from
arch/x86/kvm/vmx/posted_intr.c, as shown in a RFC patch [1].

There are some more opportunities for try_cmpxchg64 in KVM, namely
fast_pf_fix_direct_spte in arch/x86/kvm/mmu/mmu.c and
tdp_mmu_set_spte_atomic in arch/x86/kvm/mmu/tdp_mmu.c

[1] https://www.spinics.net/lists/kvm/msg266042.html
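
For reference, at such a call site the conversion looks roughly like this (a
simplified sketch with invented names, assuming the usual <linux/atomic.h>
and <linux/types.h> includes; not the actual KVM code):

/* Single-shot update that reports failure instead of looping. */
static bool try_set_control_cmpxchg64(u64 *control, u64 old, u64 new)
{
	/* cmpxchg64() returns the value it found; success means it matched. */
	return cmpxchg64(control, old, new) == old;
}

static bool try_set_control_try_cmpxchg64(u64 *control, u64 old, u64 new)
{
	/*
	 * try_cmpxchg64() returns the ZF set by cmpxchg8b/cmpxchgq directly,
	 * so the separate 64-bit comparison of the returned value goes away.
	 */
	return try_cmpxchg64(control, &old, new);
}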

Uros.
Peter Zijlstra May 11, 2022, 7:54 a.m. UTC | #3
On Tue, May 10, 2022 at 07:07:25PM +0200, Uros Bizjak wrote:
> On Tue, May 10, 2022 at 6:55 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:
> > > This patch adds try_cmpxchg64 to improve code around cmpxchg8b.  While
> > > the resulting code improvements on x86_64 are minor (a compare and a move saved),
> > > the improvements on x86_32 are quite noticeable. The code improves from:
> >
> > What user of cmpxchg64 is this?
> 
> This is cmpxchg64 in pi_try_set_control from
> arch/x86/kvm/vmx/posted_intr.c, as shown in a RFC patch [1].

I can't read that code, my brain is hard wired to read pi as priority
inheritance/inversion.

Still, does 32bit actually support that stuff?

> There are some more opportunities for try_cmpxchg64 in KVM, namely
> fast_pf_fix_direct_spte in arch/x86/kvm/mmu/mmu.c and
> tdp_mmu_set_spte_atomic in arch/x86/kvm/mmu/tdp_mmu.c

tdp_mmu is definitely 64bit only and as such shouldn't need to use
cmpxchg64.


Anyway, your patch looks about right, but I find it *really* hard to
care about 32bit code these days.
Uros Bizjak May 11, 2022, 8:24 a.m. UTC | #4
On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, May 10, 2022 at 07:07:25PM +0200, Uros Bizjak wrote:
> > On Tue, May 10, 2022 at 6:55 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:
> > > > This patch adds try_cmpxchg64 to improve code around cmpxchg8b.  While
> > > > the resulting code improvements on x86_64 are minor (a compare and a move saved),
> > > > the improvements on x86_32 are quite noticeable. The code improves from:
> > >
> > > What user of cmpxchg64 is this?
> >
> > This is cmpxchg64 in pi_try_set_control from
> > arch/x86/kvm/vmx/posted_intr.c, as shown in a RFC patch [1].
>
> I can't read that code, my brain is hard wired to read pi as priority
> inheritance/inversion.
>
> Still, does 32bit actually support that stuff?

Unfortunately, it does:

kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
               vmx/evmcs.o vmx/nested.o vmx/posted_intr.o

And when existing cmpxchg64 is substituted with cmpxchg, the
compilation dies for 32bits with:

error: call to ‘__cmpxchg_wrong_size’ declared with attribute error:
Bad argument size for cmpxchg

So, the majority of the patch deals with 32bits and tries to implement
the inlined insn correctly for all cases. The 64bit part is simply a
call to arch_try_cmpxchg, and the rest is auto-generated from scripts.
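
For reference, that error comes from the size dispatch in the generic
cmpxchg(); a standalone, simplified sketch (not the real
arch/x86/include/asm/cmpxchg.h) looks like this:

/* A call to a function declared with the error attribute that survives
 * constant folding becomes a hard build error. */
extern void __cmpxchg_wrong_size(void)
	__attribute__((error("Bad argument size for cmpxchg")));

#define demo_cmpxchg(ptr, old, new)					\
({									\
	__typeof__(*(ptr)) __old = (old);				\
	switch (sizeof(*(ptr))) {					\
	case 1:								\
	case 2:								\
	case 4:								\
		/* the real code emits lock cmpxchg{b,w,l} here */	\
		break;							\
	default:							\
		/* an 8-byte operand on x86_32 lands here */		\
		__cmpxchg_wrong_size();					\
	}								\
	__old;								\
})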

>
> > There are some more opportunities for try_cmpxchg64 in KVM, namely
> > fast_pf_fix_direct_spte in arch/x86/kvm/mmu/mmu.c and
> > tdp_mmu_set_spte_atomic in arch/x86/kvm/mmu/tdp_mmu.c
>
> tdp_mmu is definitely 64bit only and as such shouldn't need to use
> cmpxchg64.

Indeed.

>
> Anyway, your patch looks about right, but I find it *really* hard to
> care about 32bit code these days.

Thanks, this is also my sentiment, but I hope the patch will enable
better code and perhaps ease similar situation I have had elsewhere.

Uros.

Sean Christopherson May 11, 2022, 4:04 p.m. UTC | #5
On Wed, May 11, 2022, Uros Bizjak wrote:
> On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > Still, does 32bit actually support that stuff?
> 
> Unfortunately, it does:
> 
> kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
>                vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> 
> And when existing cmpxchg64 is substituted with cmpxchg, the
> compilation dies for 32bits with:

...

> > Anyway, your patch looks about right, but I find it *really* hard to
> > care about 32bit code these days.
> 
> Thanks, this is also my sentiment, but I hope the patch will enable
> better code and perhaps ease similar situation I have had elsewhere.

IMO, if we merge this it should be solely on the benefits to 64-bit code.  Yes,
KVM still supports 32-bit kernels, but I'm fairly certain the only people that
run 32-bit KVM are KVM developers.  32-bit KVM has been completely broken for
multiple releases at least once, maybe twice, and no one ever complained.

32-bit KVM is mostly useful for testing the mess that is nested NPT; an L1
hypervsior can use 32-bit paging for NPT, so KVM needs to at least make sure it
doesn't blow up if such a hypervisor is encountered.  But in terms of the performance
of 32-bit KVM, I doubt there is a person in the world that cares.
Uros Bizjak May 11, 2022, 7:54 p.m. UTC | #6
On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, May 11, 2022, Uros Bizjak wrote:
> > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > Still, does 32bit actually support that stuff?
> >
> > Unfortunately, it does:
> >
> > kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> >                vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> >
> > And when existing cmpxchg64 is substituted with cmpxchg, the
> > compilation dies for 32bits with:
>
> ...
>
> > > Anyway, your patch looks about right, but I find it *really* hard to
> > > care about 32bit code these days.
> >
> > Thanks, this is also my sentiment, but I hope the patch will enable
> > better code and perhaps ease similar situation I have had elsewhere.
>
> IMO, if we merge this it should be solely on the benefits to 64-bit code.  Yes,
> KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> run 32-bit KVM are KVM developers.  32-bit KVM has been completely broken for
> multiple releases at least once, maybe twice, and no one ever complained.

Yes, the idea was to improve cmpxchg64 with the implementation of
try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
stood in the way, so the effort with 32-bit implementation was mainly
to unblock progression for 64-bit targets.

Uros.
Peter Zijlstra May 13, 2022, 9:10 a.m. UTC | #7
On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:

For the Changelog I would focus on the 64bit improvement and leave 32bit
as a side-note.

> ---
>  arch/x86/include/asm/cmpxchg_32.h          | 43 ++++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_64.h          |  6 +++
>  include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++++++-
>  scripts/atomic/gen-atomic-instrumented.sh  |  2 +-
>  4 files changed, 89 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
> index 0a7fe0321613..e874ff7f7529 100644
> --- a/arch/x86/include/asm/cmpxchg_32.h
> +++ b/arch/x86/include/asm/cmpxchg_32.h
> @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
>  #define arch_cmpxchg64_local(ptr, o, n)					\
>  	((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
>  					       (unsigned long long)(n)))
> +#define arch_try_cmpxchg64(ptr, po, n)					\
> +	((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
> +					     (unsigned long long)(n)))
>  #endif
>  
>  static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
> @@ -70,6 +73,25 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
>  	return prev;
>  }
>  
> +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
> +{
> +	bool success;
> +	u64 prev;
> +	asm volatile(LOCK_PREFIX "cmpxchg8b %2"
> +		     CC_SET(z)
> +		     : CC_OUT(z) (success),
> +		       "=A" (prev),
> +		       "+m" (*ptr)
> +		     : "b" ((u32)new),
> +		       "c" ((u32)(new >> 32)),
> +		       "1" (*pold)
> +		     : "memory");
> +
> +	if (unlikely(!success))
> +		*pold = prev;

I would prefer this be more like the existing try_cmpxchg code,
perhaps:

	u64 old = *pold;

	asm volatile (LOCK_PREFIX "cmpxchg8b %[ptr]"
		      CC_SET(z)
		      : CC_OUT(z) (success),
		        [ptr] "+m" (*ptr),
		        "+A" (old)
		      : "b" ((u32)new),
		        "c" ((u32)(new >> 32))
		      : "memory");

	if (unlikely(!success))
		*pold = old;

The existing 32bit cmpxchg code is a 'bit' crusty.

> +	return success;
> +}
> +
>  #ifndef CONFIG_X86_CMPXCHG64
>  /*
>   * Building a kernel capable running on 80386 and 80486. It may be necessary
> @@ -108,6 +130,27 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
>  		       : "memory");				\
>  	__ret; })
>  
> +#define arch_try_cmpxchg64(ptr, po, n)				\
> +({								\
> +	bool success;						\
> +	__typeof__(*(ptr)) __prev;				\
> +	__typeof__(ptr) _old = (__typeof__(ptr))(po);		\
> +	__typeof__(*(ptr)) __old = *_old;			\
> +	__typeof__(*(ptr)) __new = (n);				\
> +	alternative_io(LOCK_PREFIX_HERE				\
> +			"call cmpxchg8b_emu",			\
> +			"lock; cmpxchg8b (%%esi)" ,		\
> +		       X86_FEATURE_CX8,				\
> +		       "=A" (__prev),				\
> +		       "S" ((ptr)), "0" (__old),		\
> +		       "b" ((unsigned int)__new),		\
> +		       "c" ((unsigned int)(__new>>32))		\
> +		       : "memory");				\
> +	success = (__prev == __old);				\
> +	if (unlikely(!success))					\
> +		*_old = __prev;					\
> +	likely(success);					\
> +})

Wouldn't this be better written like the normal fallback wrapper?

static __always_inline bool
arch_try_cmpxchg64(u64 *v, u64 *old, u64 new)
{
	u64 r, o = *old;
	r = arch_cmpxchg64(v, o, new);
	if (unlikely(r != o))
		*old = r;
	return likely(r == o);
}

Less magical, same exact code.
Uros Bizjak May 13, 2022, 10:20 a.m. UTC | #8
On Fri, May 13, 2022 at 11:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:
>
> For the Changelog I would focus on the 64bit improvement and leave 32bit
> as a side-note.

Thanks, I will rephrase the ChangeLog.

>
> > ---
> >  arch/x86/include/asm/cmpxchg_32.h          | 43 ++++++++++++++++++++++
> >  arch/x86/include/asm/cmpxchg_64.h          |  6 +++
> >  include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++++++-
> >  scripts/atomic/gen-atomic-instrumented.sh  |  2 +-
> >  4 files changed, 89 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
> > index 0a7fe0321613..e874ff7f7529 100644
> > --- a/arch/x86/include/asm/cmpxchg_32.h
> > +++ b/arch/x86/include/asm/cmpxchg_32.h
> > @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
> >  #define arch_cmpxchg64_local(ptr, o, n)                                      \
> >       ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
> >                                              (unsigned long long)(n)))
> > +#define arch_try_cmpxchg64(ptr, po, n)                                       \
> > +     ((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
> > +                                          (unsigned long long)(n)))
> >  #endif
> >
> >  static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
> > @@ -70,6 +73,25 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
> >       return prev;
> >  }
> >
> > +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
> > +{
> > +     bool success;
> > +     u64 prev;
> > +     asm volatile(LOCK_PREFIX "cmpxchg8b %2"
> > +                  CC_SET(z)
> > +                  : CC_OUT(z) (success),
> > +                    "=A" (prev),
> > +                    "+m" (*ptr)
> > +                  : "b" ((u32)new),
> > +                    "c" ((u32)(new >> 32)),
> > +                    "1" (*pold)
> > +                  : "memory");
> > +
> > +     if (unlikely(!success))
> > +             *pold = prev;
>
> I would prefer this be more like the existing try_cmpxchg code,
> perhaps:
>
>         u64 old = *pold;
>
>         asm volatile (LOCK_PREFIX "cmpxchg8b %[ptr]"
>                       CC_SET(z)
>                       : CC_OUT(z) (success),
>                         [ptr] "+m" (*ptr),
>                         "+A" (old)
>                       : "b" ((u32)new),
>                         "c" ((u32)(new >> 32))
>                       : "memory");
>
>         if (unlikely(!success))
>                 *pold = old;
>
> The existing 32bit cmpxchg code is a 'bit' crusty.

I was trying to follow the existing __cmpxchg64 as much as possible,
with the intention of a follow-up patch that would modernize
everything in cmpxchg_32.h. I can surely go the other way and submit
modernized new code.

> > +     return success;
> > +}
> > +
> >  #ifndef CONFIG_X86_CMPXCHG64
> >  /*
> >   * Building a kernel capable running on 80386 and 80486. It may be necessary
> > @@ -108,6 +130,27 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
> >                      : "memory");                             \
> >       __ret; })
> >
> > +#define arch_try_cmpxchg64(ptr, po, n)                               \
> > +({                                                           \
> > +     bool success;                                           \
> > +     __typeof__(*(ptr)) __prev;                              \
> > +     __typeof__(ptr) _old = (__typeof__(ptr))(po);           \
> > +     __typeof__(*(ptr)) __old = *_old;                       \
> > +     __typeof__(*(ptr)) __new = (n);                         \
> > +     alternative_io(LOCK_PREFIX_HERE                         \
> > +                     "call cmpxchg8b_emu",                   \
> > +                     "lock; cmpxchg8b (%%esi)" ,             \
> > +                    X86_FEATURE_CX8,                         \
> > +                    "=A" (__prev),                           \
> > +                    "S" ((ptr)), "0" (__old),                \
> > +                    "b" ((unsigned int)__new),               \
> > +                    "c" ((unsigned int)(__new>>32))          \
> > +                    : "memory");                             \
> > +     success = (__prev == __old);                            \
> > +     if (unlikely(!success))                                 \
> > +             *_old = __prev;                                 \
> > +     likely(success);                                        \
> > +})
>
> Wouldn't this be better written like the normal fallback wrapper?
>
> static __always_inline bool
> arch_try_cmpxchg64(u64 *v, u64 *old, u64 new)
> {
>         u64 r, o = *old;
>         r = arch_cmpxchg64(v, o, new);
>         if (unlikely(r != o))
>                 *old = r;
>         return likely(r == o);
> }
>
> Less magical, same exact code.

Also, I tried to follow the existing #defines. Will improve the
code according to your suggestion here.

Thanks,
Uros.
Uros Bizjak May 13, 2022, 3:36 p.m. UTC | #9
On Fri, May 13, 2022 at 12:20 PM Uros Bizjak <ubizjak@gmail.com> wrote:

> > > +#define arch_try_cmpxchg64(ptr, po, n)                               \
> > > +({                                                           \
> > > +     bool success;                                           \
> > > +     __typeof__(*(ptr)) __prev;                              \
> > > +     __typeof__(ptr) _old = (__typeof__(ptr))(po);           \
> > > +     __typeof__(*(ptr)) __old = *_old;                       \
> > > +     __typeof__(*(ptr)) __new = (n);                         \
> > > +     alternative_io(LOCK_PREFIX_HERE                         \
> > > +                     "call cmpxchg8b_emu",                   \
> > > +                     "lock; cmpxchg8b (%%esi)" ,             \
> > > +                    X86_FEATURE_CX8,                         \
> > > +                    "=A" (__prev),                           \
> > > +                    "S" ((ptr)), "0" (__old),                \
> > > +                    "b" ((unsigned int)__new),               \
> > > +                    "c" ((unsigned int)(__new>>32))          \
> > > +                    : "memory");                             \
> > > +     success = (__prev == __old);                            \
> > > +     if (unlikely(!success))                                 \
> > > +             *_old = __prev;                                 \
> > > +     likely(success);                                        \
> > > +})
> >
> > Wouldn't this be better written like the normal fallback wrapper?
> >
> > static __always_inline bool
> > arch_try_cmpxchg64(u64 *v, u64 *old, u64 new)
> > {
> >         u64 r, o = *old;
> >         r = arch_cmpxchg64(v, o, new);
> >         if (unlikely(r != o))
> >                 *old = r;
> >         return likely(r == o);
> > }
> >
> > Less magical, same exact code.
>
> Also, I tried to follow up the existing #defines. Will improve the
> code according to your suggestion here.

In the v2 patch, generic fallbacks were introduced, so that
arch_try_cmpxchg64 can be used when only arch_cmpxchg64 is defined.
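
For reference, the generated fallback has roughly this shape (a sketch that
assumes it mirrors the existing try_cmpxchg fallbacks; the authoritative
version is what the scripts/atomic generators emit in v2):

#ifndef arch_try_cmpxchg64
#define arch_try_cmpxchg64(_ptr, _oldp, _new)				\
({									\
	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r;		\
	___r = arch_cmpxchg64((_ptr), ___o, (_new));			\
	if (unlikely(___r != ___o))					\
		*___op = ___r;						\
	likely(___r == ___o);						\
})
#endif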

Uros.
Maxim Levitsky May 16, 2022, 2:04 p.m. UTC | #10
On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <seanjc@google.com> wrote:
> > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > Still, does 32bit actually support that stuff?
> > > 
> > > Unfortunately, it does:
> > > 
> > > kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > >                vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > 
> > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > compilation dies for 32bits with:
> > 
> > ...
> > 
> > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > care about 32bit code these days.
> > > 
> > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > better code and perhaps ease similar situation I have had elsewhere.
> > 
> > IMO, if we merge this it should be solely on the benefits to 64-bit code.  Yes,
> > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > run 32-bit KVM are KVM developers.  32-bit KVM has been completely broken for
> > multiple releases at least once, maybe twice, and no one ever complained.
> 
> Yes, the idea was to improve cmpxchg64 with the implementation of
> try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> stood in the way, so the effort with 32-bit implementation was mainly
> to unblock progression for 64-bit targets.

Would that allow tdp mmu to work on 32 bit?

Best regards,
	Maxim Levitsky

> 
> Uros.
>
Sean Christopherson May 16, 2022, 2:08 p.m. UTC | #11
On Mon, May 16, 2022, Maxim Levitsky wrote:
> On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> > On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <seanjc@google.com> wrote:
> > > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > Still, does 32bit actually support that stuff?
> > > > 
> > > > Unfortunately, it does:
> > > > 
> > > > kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > > >                vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > > 
> > > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > > compilation dies for 32bits with:
> > > 
> > > ...
> > > 
> > > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > > care about 32bit code these days.
> > > > 
> > > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > > better code and perhaps ease similar situation I have had elsewhere.
> > > 
> > > IMO, if we merge this it should be solely on the benefits to 64-bit code.  Yes,
> > > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > > run 32-bit KVM are KVM developers.  32-bit KVM has been completely broken for
> > > multiple releases at least once, maybe twice, and no one ever complained.
> > 
> > Yes, the idea was to improve cmpxchg64 with the implementation of
> > try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> > stood in the way, so the effort with 32-bit implementation was mainly
> > to unblock progression for 64-bit targets.
> 
> Would that allow tdp mmu to work on 32 bit?

From a purely technical perspective, there's nothing that prevents enabling the
TDP MMU on 32-bit kernels.  The TDP MMU is 64-bit only to simplify the implementation
and to reduce the maintenance and validation costs.
Maxim Levitsky May 16, 2022, 2:49 p.m. UTC | #12
On Mon, 2022-05-16 at 14:08 +0000, Sean Christopherson wrote:
> On Mon, May 16, 2022, Maxim Levitsky wrote:
> > On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> > > On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > Still, does 32bit actually support that stuff?
> > > > > 
> > > > > Unfortunately, it does:
> > > > > 
> > > > > kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > > > >                vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > > > 
> > > > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > > > compilation dies for 32bits with:
> > > > 
> > > > ...
> > > > 
> > > > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > > > care about 32bit code these days.
> > > > > 
> > > > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > > > better code and perhaps ease similar situation I have had elsewhere.
> > > > 
> > > > IMO, if we merge this it should be solely on the benefits to 64-bit code.  Yes,
> > > > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > > > run 32-bit KVM are KVM developers.  32-bit KVM has been completely broken for
> > > > multiple releases at least once, maybe twice, and no one ever complained.
> > > 
> > > Yes, the idea was to improve cmpxchg64 with the implementation of
> > > try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> > > stood in the way, so the effort with 32-bit implementation was mainly
> > > to unblock progression for 64-bit targets.
> > 
> > Would that allow tdp mmu to work on 32 bit?
> 
> From a purely technical perspective, there's nothing that prevents enabling the
> TDP MMU on 32-bit kernels.  The TDP MMU is 64-bit only to simplify the implementation
> and to reduce the maintenance and validation costs.
> 

I understand exactly that, so the question, will this patch help make the tdp mmu work transparently
on 32 bit kernels? I  heard that 64 bit cmpxchg was one of the main reasons that it is 64 bit only.

I am asking because there was some talk to eliminate the direct mode from the legacy non tdp mmu,
which would simplify its code by a lot, but then it will make 32 bit kernel fail back to shadowing mmu.

I know that nobody needs 32 bit KVM host support, but it is useful to be sure that nesting still works, and
doesn't crash the host and such.

Best regards,
	Maxim Levitsky
Sean Christopherson May 16, 2022, 3:14 p.m. UTC | #13
On Mon, May 16, 2022, Maxim Levitsky wrote:
> On Mon, 2022-05-16 at 14:08 +0000, Sean Christopherson wrote:
> > On Mon, May 16, 2022, Maxim Levitsky wrote:
> > > On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> > > > On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > > > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > > Still, does 32bit actually support that stuff?
> > > > > > 
> > > > > > Unfortunately, it does:
> > > > > > 
> > > > > > kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > > > > >                vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > > > > 
> > > > > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > > > > compilation dies for 32bits with:
> > > > > 
> > > > > ...
> > > > > 
> > > > > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > > > > care about 32bit code these days.
> > > > > > 
> > > > > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > > > > better code and perhaps ease similar situation I have had elsewhere.
> > > > > 
> > > > > IMO, if we merge this it should be solely on the benefits to 64-bit code.  Yes,
> > > > > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > > > > run 32-bit KVM are KVM developers.  32-bit KVM has been completely broken for
> > > > > multiple releases at least once, maybe twice, and no one ever complained.
> > > > 
> > > > Yes, the idea was to improve cmpxchg64 with the implementation of
> > > > try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> > > > stood in the way, so the effort with 32-bit implementation was mainly
> > > > to unblock progression for 64-bit targets.
> > > 
> > > Would that allow tdp mmu to work on 32 bit?
> > 
> > From a purely technical perspective, there's nothing that prevents enabling the
> > TDP MMU on 32-bit kernels.  The TDP MMU is 64-bit only to simplify the implementation
> > and to reduce the maintenance and validation costs.
> 
> I understand exactly that, so the question, will this patch help make the tdp
> mmu work transparently on 32 bit kernels? I  heard that 64 bit cmpxchg was
> one of the main reasons that it is 64 bit only.

I don't think it moves the needle much, e.g. non-atomic 64-bit accesses are still
problematic, and we'd have to update the TDP MMU to deal with PAE paging (thanks
NPT).  All those problems are solvable, it's purely a matter of the ongoing costs
to solve them.

> I am asking because there was some talk to eliminate the direct mode from the
> legacy non tdp mmu, which would simplify its code by a lot, but then it will
> make 32 bit kernel fail back to shadowing mmu.

Simplify which code?  Between the nonpaging code and direct shadow pages in
indirect MMUs, the vast majority of the "direct" support in the legacy MMU needs
to be kept even if TDP support is dropped.  And the really nasty stuff, e.g. PAE
roots, would need to be carried over to the TDP MMU.
Maxim Levitsky May 16, 2022, 3:36 p.m. UTC | #14
On Mon, 2022-05-16 at 15:14 +0000, Sean Christopherson wrote:
> On Mon, May 16, 2022, Maxim Levitsky wrote:
> > On Mon, 2022-05-16 at 14:08 +0000, Sean Christopherson wrote:
> > > On Mon, May 16, 2022, Maxim Levitsky wrote:
> > > > On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> > > > > On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > > > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > > > > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > > > Still, does 32bit actually support that stuff?
> > > > > > > 
> > > > > > > Unfortunately, it does:
> > > > > > > 
> > > > > > > kvm-intel-y        += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > > > > > >                vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > > > > > 
> > > > > > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > > > > > compilation dies for 32bits with:
> > > > > > 
> > > > > > ...
> > > > > > 
> > > > > > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > > > > > care about 32bit code these days.
> > > > > > > 
> > > > > > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > > > > > better code and perhaps ease similar situation I have had elsewhere.
> > > > > > 
> > > > > > IMO, if we merge this it should be solely on the benefits to 64-bit code.  Yes,
> > > > > > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > > > > > run 32-bit KVM are KVM developers.  32-bit KVM has been completely broken for
> > > > > > multiple releases at least once, maybe twice, and no one ever complained.
> > > > > 
> > > > > Yes, the idea was to improve cmpxchg64 with the implementation of
> > > > > try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> > > > > stood in the way, so the effort with 32-bit implementation was mainly
> > > > > to unblock progression for 64-bit targets.
> > > > 
> > > > Would that allow tdp mmu to work on 32 bit?
> > > 
> > > From a purely technical perspective, there's nothing that prevents enabling the
> > > TDP MMU on 32-bit kernels.  The TDP MMU is 64-bit only to simplify the implementation
> > > and to reduce the maintenance and validation costs.
> > 
> > I understand exactly that, so the question, will this patch help make the tdp
> > mmu work transparently on 32 bit kernels? I  heard that 64 bit cmpxchg was
> > one of the main reasons that it is 64 bit only.
> 
> I don't think it moves the needle much, e.g. non-atomic 64-bit accesses are still
> problematic, and we'd have to update the TDP MMU to deal with PAE paging (thanks
> NPT).  All those problems are solvable, it's purely a matter of the ongoing costs
> to solve them.
> 
> > I am asking because there was some talk to eliminate the direct mode from the
> > legacy non tdp mmu, which would simplify its code by a lot, but then it will
> > make 32 bit kernel fail back to shadowing mmu.
> 
> Simplify which code?  Between the nonpaging code and direct shadow pages in
> indirect MMUs, the vast majority of the "direct" support in the legacy MMU needs
> to be kept even if TDP support is dropped.  And the really nasty stuff, e.g. PAE
> roots, would need to be carried over to the TDP MMU.
> 

I guess this makes sense. I haven't researched the code well enough to know the exact answer.
I was just curious if this patch makes any difference :)

Thanks!

Best regards,
	Maxim Levitsky

Patch

diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 0a7fe0321613..e874ff7f7529 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -42,6 +42,9 @@  static inline void set_64bit(volatile u64 *ptr, u64 value)
 #define arch_cmpxchg64_local(ptr, o, n)					\
 	((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
 					       (unsigned long long)(n)))
+#define arch_try_cmpxchg64(ptr, po, n)					\
+	((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
+					     (unsigned long long)(n)))
 #endif
 
 static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
@@ -70,6 +73,25 @@  static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
 	return prev;
 }
 
+static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+{
+	bool success;
+	u64 prev;
+	asm volatile(LOCK_PREFIX "cmpxchg8b %2"
+		     CC_SET(z)
+		     : CC_OUT(z) (success),
+		       "=A" (prev),
+		       "+m" (*ptr)
+		     : "b" ((u32)new),
+		       "c" ((u32)(new >> 32)),
+		       "1" (*pold)
+		     : "memory");
+
+	if (unlikely(!success))
+		*pold = prev;
+	return success;
+}
+
 #ifndef CONFIG_X86_CMPXCHG64
 /*
  * Building a kernel capable running on 80386 and 80486. It may be necessary
@@ -108,6 +130,27 @@  static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
 		       : "memory");				\
 	__ret; })
 
+#define arch_try_cmpxchg64(ptr, po, n)				\
+({								\
+	bool success;						\
+	__typeof__(*(ptr)) __prev;				\
+	__typeof__(ptr) _old = (__typeof__(ptr))(po);		\
+	__typeof__(*(ptr)) __old = *_old;			\
+	__typeof__(*(ptr)) __new = (n);				\
+	alternative_io(LOCK_PREFIX_HERE				\
+			"call cmpxchg8b_emu",			\
+			"lock; cmpxchg8b (%%esi)" ,		\
+		       X86_FEATURE_CX8,				\
+		       "=A" (__prev),				\
+		       "S" ((ptr)), "0" (__old),		\
+		       "b" ((unsigned int)__new),		\
+		       "c" ((unsigned int)(__new>>32))		\
+		       : "memory");				\
+	success = (__prev == __old);				\
+	if (unlikely(!success))					\
+		*_old = __prev;					\
+	likely(success);					\
+})
 #endif
 
 #define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 072e5459fe2f..250187ac8248 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -19,6 +19,12 @@  static inline void set_64bit(volatile u64 *ptr, u64 val)
 	arch_cmpxchg_local((ptr), (o), (n));				\
 })
 
+#define arch_try_cmpxchg64(ptr, po, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	arch_try_cmpxchg((ptr), (po), (n));				\
+})
+
 #define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
 
 #endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 5d69b143c28e..7a139ec030b0 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -2006,6 +2006,44 @@  atomic_long_dec_if_positive(atomic_long_t *v)
 	arch_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
+#define try_cmpxchg64(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	kcsan_mb(); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_acquire(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_release(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	kcsan_release(); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_relaxed(ptr, oldp, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	typeof(oldp) __ai_oldp = (oldp); \
+	instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+	arch_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
 #define cmpxchg_local(ptr, ...) \
 ({ \
 	typeof(ptr) __ai_ptr = (ptr); \
@@ -2045,4 +2083,4 @@  atomic_long_dec_if_positive(atomic_long_t *v)
 })
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 87c974b93032afd42143613434d1a7788fa598f9
+// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 68f902731d01..77c06526a574 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -166,7 +166,7 @@  grep '^[a-z]' "$1" | while read name meta args; do
 done
 
 
-for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg"; do
+for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg" "try_cmpxchg64"; do
 	for order in "" "_acquire" "_release" "_relaxed"; do
 		gen_xchg "${xchg}" "${order}" ""
 		printf "\n"