
KVM: x86: optimize check for valid PAT value

Message ID 1554890126-347-1-git-send-email-pbonzini@redhat.com (mailing list archive)
State New, archived
Series KVM: x86: optimize check for valid PAT value

Commit Message

Paolo Bonzini April 10, 2019, 9:55 a.m. UTC
This check will soon be done on every nested vmentry and vmexit,
"parallelize" it using bitwise operations.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mtrr.c    | 10 +---------
 arch/x86/kvm/vmx/vmx.c |  2 +-
 arch/x86/kvm/x86.h     |  8 ++++++++
 3 files changed, 10 insertions(+), 10 deletions(-)

Comments

David Laight April 10, 2019, 12:55 p.m. UTC | #1
From: Paolo Bonzini
> Sent: 10 April 2019 10:55
> 
> This check will soon be done on every nested vmentry and vmexit,
> "parallelize" it using bitwise operations.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
...
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 28406aa1136d..7bc7ac9d2a44 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -347,4 +347,12 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
>  	__this_cpu_write(current_vcpu, NULL);
>  }
> 
> +static inline bool kvm_pat_valid(u64 data)
> +{
> +	if (data & 0xF8F8F8F8F8F8F8F8)
> +		return false;
> +	/* 0, 1, 4, 5, 6, 7 are valid values.  */
> +	return (data | ((data & 0x0202020202020202) << 1)) == data;
> +}
> +

How about:
	/*
	 * Each byte must be 0, 1, 4, 5, 6 or 7.
	 * Convert 001x to 011x then 100x so 2 and 3 fail the test.
	 */
	data |= (data ^ 0x0404040404040404ULL) + 0x0202020202020202ULL;
	if (data & 0xF8F8F8F8F8F8F8F8ULL)
		return false;

   David

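As a quick sanity check of the two formulations, here is a stand-alone,
user-space sketch (not part of the thread or the patch; the function names
are only illustrative) that brute-forces every 8-bit memory type, replicates
it into all eight PAT entries, and compares both bitwise variants against
the original per-byte loop:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* The original per-byte check from arch/x86/kvm/mtrr.c. */
static bool valid_pat_type(unsigned t)
{
	return t < 8 && ((1 << t) & 0xf3);	/* 0, 1, 4, 5, 6, 7 */
}

/* Paolo's version from the patch. */
static bool pat_valid_bitwise(uint64_t data)
{
	if (data & 0xF8F8F8F8F8F8F8F8ULL)
		return false;
	/* 0, 1, 4, 5, 6, 7 are valid values.  */
	return (data | ((data & 0x0202020202020202ULL) << 1)) == data;
}

/* David's suggested variant from above. */
static bool pat_valid_add(uint64_t data)
{
	data |= (data ^ 0x0404040404040404ULL) + 0x0202020202020202ULL;
	return !(data & 0xF8F8F8F8F8F8F8F8ULL);
}

int main(void)
{
	int errors = 0;

	for (unsigned t = 0; t < 256; t++) {
		/* Replicate the type into all eight PAT entries. */
		uint64_t data = 0x0101010101010101ULL * t;
		bool ref = valid_pat_type(t);

		if (pat_valid_bitwise(data) != ref || pat_valid_add(data) != ref) {
			printf("mismatch for type 0x%02x\n", t);
			errors++;
		}
	}
	printf(errors ? "FAIL\n" : "OK\n");
	return errors != 0;
}
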
Sean Christopherson April 10, 2019, 2:57 p.m. UTC | #2
On Wed, Apr 10, 2019 at 12:55:53PM +0000, David Laight wrote:
> From: Paolo Bonzini
> > Sent: 10 April 2019 10:55
> > 
> > This check will soon be done on every nested vmentry and vmexit,
> > "parallelize" it using bitwise operations.
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> ...
> > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> > index 28406aa1136d..7bc7ac9d2a44 100644
> > --- a/arch/x86/kvm/x86.h
> > +++ b/arch/x86/kvm/x86.h
> > @@ -347,4 +347,12 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
> >  	__this_cpu_write(current_vcpu, NULL);
> >  }
> > 
> > +static inline bool kvm_pat_valid(u64 data)
> > +{
> > +	if (data & 0xF8F8F8F8F8F8F8F8)
> > +		return false;
> > +	/* 0, 1, 4, 5, 6, 7 are valid values.  */
> > +	return (data | ((data & 0x0202020202020202) << 1)) == data;
> > +}
> > +
> 
> How about:
> 	/*
> 	 * Each byte must be 0, 1, 4, 5, 6 or 7.
> 	 * Convert 001x to 011x then 100x so 2 and 3 fail the test.
> 	 */
> 	data |= (data ^ 0x0404040404040404ULL) + 0x0202020202020202ULL;
> 	if (data & 0xF8F8F8F8F8F8F8F8ULL)
> 		return false;

Woah.  My vote is for Paolo's version as the separate checks allow the
reader to walk through step-by-step.  The generated assembly isn't much
different from a performance perspective since the TEST+JNE will not be
taken in the fast path.

Fancy:
   0x000000000004844f <+255>:   movabs $0xf8f8f8f8f8f8f8f8,%rcx
   0x0000000000048459 <+265>:   xor    %eax,%eax
   0x000000000004845b <+267>:   test   %rcx,%rdx
   0x000000000004845e <+270>:   jne    0x4848b <kvm_mtrr_valid+315>
   0x0000000000048460 <+272>:   movabs $0x202020202020202,%rax
   0x000000000004846a <+282>:   and    %rdx,%rax
   0x000000000004846d <+285>:   add    %rax,%rax
   0x0000000000048470 <+288>:   or     %rdx,%rax
   0x0000000000048473 <+291>:   cmp    %rdx,%rax
   0x0000000000048476 <+294>:   sete   %al
   0x0000000000048479 <+297>:   retq

Really fancy:
   0x0000000000048447 <+247>:   movabs $0x404040404040404,%rcx
   0x0000000000048451 <+257>:   movabs $0x202020202020202,%rax
   0x000000000004845b <+267>:   xor    %rdx,%rcx
   0x000000000004845e <+270>:   add    %rax,%rcx
   0x0000000000048461 <+273>:   movabs $0xf8f8f8f8f8f8f8f8,%rax
   0x000000000004846b <+283>:   or     %rcx,%rdx
   0x000000000004846e <+286>:   test   %rax,%rdx
   0x0000000000048471 <+289>:   sete   %al
   0x0000000000048474 <+292>:   retq
Sean Christopherson April 10, 2019, 2:58 p.m. UTC | #3
On Wed, Apr 10, 2019 at 11:55:26AM +0200, Paolo Bonzini wrote:
> This check will soon be done on every nested vmentry and vmexit,
> "parallelize" it using bitwise operations.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Paolo Bonzini April 10, 2019, 5:09 p.m. UTC | #4
On 10/04/19 16:57, Sean Christopherson wrote:
> On Wed, Apr 10, 2019 at 12:55:53PM +0000, David Laight wrote:
>> From: Paolo Bonzini
>>> Sent: 10 April 2019 10:55
>>>
>>> This check will soon be done on every nested vmentry and vmexit,
>>> "parallelize" it using bitwise operations.
>>>
>>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>>> ---
>> ...
>>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>>> index 28406aa1136d..7bc7ac9d2a44 100644
>>> --- a/arch/x86/kvm/x86.h
>>> +++ b/arch/x86/kvm/x86.h
>>> @@ -347,4 +347,12 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
>>>  	__this_cpu_write(current_vcpu, NULL);
>>>  }
>>>
>>> +static inline bool kvm_pat_valid(u64 data)
>>> +{
>>> +	if (data & 0xF8F8F8F8F8F8F8F8)
>>> +		return false;
>>> +	/* 0, 1, 4, 5, 6, 7 are valid values.  */
>>> +	return (data | ((data & 0x0202020202020202) << 1)) == data;
>>> +}
>>> +
>>
>> How about:
>> 	/*
>> 	 * Each byte must be 0, 1, 4, 5, 6 or 7.
>> 	 * Convert 001x to 011x then 100x so 2 and 3 fail the test.
>> 	 */
>> 	data |= (data ^ 0x0404040404040404ULL) + 0x0202020202020202ULL;
>> 	if (data & 0xF8F8F8F8F8F8F8F8ULL)
>> 		return false;
> 
> Woah.  My vote is for Paolo's version as the separate checks allow the
> reader to walk through step-by-step.  The generated assembly isn't much
> different from a performance perspective since the TEST+JNE will not be
> taken in the fast path.
> 
> Fancy:
>    0x000000000004844f <+255>:   movabs $0xf8f8f8f8f8f8f8f8,%rcx
>    0x0000000000048459 <+265>:   xor    %eax,%eax
>    0x000000000004845b <+267>:   test   %rcx,%rdx
>    0x000000000004845e <+270>:   jne    0x4848b <kvm_mtrr_valid+315>
>    0x0000000000048460 <+272>:   movabs $0x202020202020202,%rax
>    0x000000000004846a <+282>:   and    %rdx,%rax
>    0x000000000004846d <+285>:   add    %rax,%rax
>    0x0000000000048470 <+288>:   or     %rdx,%rax
>    0x0000000000048473 <+291>:   cmp    %rdx,%rax
>    0x0000000000048476 <+294>:   sete   %al
>    0x0000000000048479 <+297>:   retq
> 
> Really fancy:
>    0x0000000000048447 <+247>:   movabs $0x404040404040404,%rcx
>    0x0000000000048451 <+257>:   movabs $0x202020202020202,%rax
>    0x000000000004845b <+267>:   xor    %rdx,%rcx
>    0x000000000004845e <+270>:   add    %rax,%rcx
>    0x0000000000048461 <+273>:   movabs $0xf8f8f8f8f8f8f8f8,%rax
>    0x000000000004846b <+283>:   or     %rcx,%rdx
>    0x000000000004846e <+286>:   test   %rax,%rdx
>    0x0000000000048471 <+289>:   sete   %al
>    0x0000000000048474 <+292>:   retq

Yeah, the three constants are expensive.  Too bad the really fancy
version sums twos and xors fours; if it were the opposite, it could have
used lea and then I would have chosen that one just for the coolness factor.

(Quoting Avi, "mmu.c is designed around the fact that x86 has an
instruction to do 'x = 12 + 9*y'".)

Paolo
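As a tiny illustration of the lea point (not from the thread; the function
name is made up), a constant scale like 9 plus a displacement can typically
be folded by the compiler into a single lea:

/* Compilers will usually turn this into one instruction, something like
 * "lea 0xc(%rdi,%rdi,8), %eax": base + index*scale + displacement is
 * exactly the address form that lea computes.
 */
int twelve_plus_nine_y(int y)
{
	return 12 + 9 * y;
}
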
Krish Sadhukhan April 10, 2019, 11:36 p.m. UTC | #5
On 04/10/2019 02:55 AM, Paolo Bonzini wrote:
> This check will soon be done on every nested vmentry and vmexit,
> "parallelize" it using bitwise operations.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   arch/x86/kvm/mtrr.c    | 10 +---------
>   arch/x86/kvm/vmx/vmx.c |  2 +-
>   arch/x86/kvm/x86.h     |  8 ++++++++
>   3 files changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
> index e9ea2d45ae66..9f72cc427158 100644
> --- a/arch/x86/kvm/mtrr.c
> +++ b/arch/x86/kvm/mtrr.c
> @@ -48,11 +48,6 @@ static bool msr_mtrr_valid(unsigned msr)
>   	return false;
>   }
>   
> -static bool valid_pat_type(unsigned t)
> -{
> -	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
> -}
> -
>   static bool valid_mtrr_type(unsigned t)
>   {
>   	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
> @@ -67,10 +62,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>   		return false;
>   
>   	if (msr == MSR_IA32_CR_PAT) {
> -		for (i = 0; i < 8; i++)
> -			if (!valid_pat_type((data >> (i * 8)) & 0xff))
> -				return false;
> -		return true;
> +		return kvm_pat_valid(data);
>   	} else if (msr == MSR_MTRRdefType) {
>   		if (data & ~0xcff)
>   			return false;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b6c533afbf27..b74679732cfc 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1891,7 +1891,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   		break;
>   	case MSR_IA32_CR_PAT:
>   		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> -			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> +			if (!kvm_pat_valid(data))
>   				return 1;
>   			vmcs_write64(GUEST_IA32_PAT, data);
>   			vcpu->arch.pat = data;
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 28406aa1136d..7bc7ac9d2a44 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -347,4 +347,12 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
>   	__this_cpu_write(current_vcpu, NULL);
>   }
>   
> +static inline bool kvm_pat_valid(u64 data)
> +{
> +	if (data & 0xF8F8F8F8F8F8F8F8)
> +		return false;
> +	/* 0, 1, 4, 5, 6, 7 are valid values.  */
> +	return (data | ((data & 0x0202020202020202) << 1)) == data;
> +}
> +
>   #endif

Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
David Laight April 11, 2019, 9:06 a.m. UTC | #6
From: Paolo Bonzini
> Sent: 10 April 2019 18:09
> On 10/04/19 16:57, Sean Christopherson wrote:
> > On Wed, Apr 10, 2019 at 12:55:53PM +0000, David Laight wrote:
> >> From: Paolo Bonzini
> >>> Sent: 10 April 2019 10:55
> >>>
> >>> This check will soon be done on every nested vmentry and vmexit,
> >>> "parallelize" it using bitwise operations.
...
> >> How about:
> >> 	/*
> >> 	 * Each byte must be 0, 1, 4, 5, 6 or 7.
> >> 	 * Convert 001x to 011x then 100x so 2 and 3 fail the test.
> >> 	 */
> >> 	data |= (data ^ 0x0404040404040404ULL) + 0x0202020202020202ULL;
> >> 	if (data & 0xF8F8F8F8F8F8F8F8ULL)
> >> 		return false;
...
> > Really fancy:
> >    0x0000000000048447 <+247>:   movabs $0x404040404040404,%rcx
> >    0x0000000000048451 <+257>:   movabs $0x202020202020202,%rax
> >    0x000000000004845b <+267>:   xor    %rdx,%rcx
> >    0x000000000004845e <+270>:   add    %rax,%rcx
> >    0x0000000000048461 <+273>:   movabs $0xf8f8f8f8f8f8f8f8,%rax
> >    0x000000000004846b <+283>:   or     %rcx,%rdx
> >    0x000000000004846e <+286>:   test   %rax,%rdx
> >    0x0000000000048471 <+289>:   sete   %al
> >    0x0000000000048474 <+292>:   retq
> 
> Yeah, the three constants are expensive.  Too bad the really fancy
> version sums twos and xors fours; if it were the opposite, it could have
> used lea and then I would have chosen that one just for the coolness factor.

The constants are just big; if the code is inlined they are probably
otherwise free (they won't change the length of the dependency chain).
The twos can be generated from the fours without increasing the length
of the dependency chain, saving some bytes.

It may be possible to generate shorter code that executes just as
fast by generating a single constant and deriving the others from it.
- generate 4s - needed first
- shift right 2 to get 1s (in parallel with the xor)
- use lea to get 6s (in parallel with an lea to do the add)
- invert the 1s to get FEs (also in parallel with the add)
- xor the FEs with the 6s to get F8s (in parallel with the or)
- and/test for the result
I'm not going to try to persuade gcc to do that.

	David

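For concreteness, a rough C transliteration of the constant derivation
David sketches above (illustrative only; a compiler will simply
constant-fold all of it, so it only matters if the constants are
materialized at run time):

#include <stdint.h>

/* Derive every mask from the single 0x04... literal, following the steps
 * above: 1s from the 4s by a shift, 6s by an lea-friendly add, 0xFE bytes
 * by inverting the 1s, and the final 0xF8 mask by xor-ing those with the 6s.
 */
static inline uint64_t derive_f8_mask(void)
{
	uint64_t fours = 0x0404040404040404ULL;	/* generated first */
	uint64_t ones  = fours >> 2;		/* 0x01 per byte */
	uint64_t sixes = fours + ones * 2;	/* 0x06 per byte */
	uint64_t fes   = ~ones;			/* 0xFE per byte */

	return fes ^ sixes;			/* 0xF8 per byte */
}
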
Paolo Bonzini April 15, 2019, 8:11 a.m. UTC | #7
On 11/04/19 11:06, David Laight wrote:
> It may be possible to generate shorter code that executes just as
> fast by generating a single constant and deriving the others from it.
> - generate 4s - needed first
> - shift right 2 to get 1s (in parallel with the xor)
> - use lea to get 6s (in parallel with an lea to do the add)
> - invert the 1s to get FEs (also in parallel with the add)
> - xor the FEs with the 6s to get F8s (in parallel with the or)
> - and/test for the result

FWIW, here is yet another way to do it:

/* Change 6/7 to 4/5 */
data &= ~((data & 0x0404040404040404ULL) >> 1);
/* Only allow 0/1/4/5 now */
return !(data & 0xFAFAFAFAFAFAFAFAULL);

movabs $0x404040404040404, %rcx
andq   %rdx, %rcx
shrq   %rcx
notq   %rcx
movabs $0xFAFAFAFAFAFAFAFA, %rax
andq   %rcx, %rdx
test   %rax, %rdx

Paolo
David Laight April 15, 2019, 9:03 a.m. UTC | #8
From: Paolo Bonzini
> Sent: 15 April 2019 09:12
> On 11/04/19 11:06, David Laight wrote:
> > It may be possible to generate shorter code that executes just as
> > fast by generating a single constant and deriving the others from it.
> > - generate 4s - needed first
> > - shift right 2 to get 1s (in parallel with the xor)
> > - use lea to get 6s (in parallel with an lea to do the add)
> > - invert the 1s to get FEs (also in parallel with the add)
> > - xor the FEs with the 6s to get F8s (in parallel with the or)
> > - and/test for the result

That version needs an extra register move I hadn't allowed for.
It is also impossible to stop gcc folding constant expressions
without an asm nop on a register.

> FWIW, here is yet another way to do it:
> 
> /* Change 6/7 to 4/5 */
> data &= ~((data & 0x0404040404040404ULL) >> 1);
> /* Only allow 0/1/4/5 now */
> return !(data & 0xFAFAFAFAFAFAFAFAULL);
> 
> movabs $0x404040404040404, %rcx
> andq   %rdx, %rcx
> shrq   %rcx
> notq   %rcx
> movabs $0xFAFAFAFAFAFAFA, %rax
> andq   %rcx, %rdx
> test   %rax, %rdx

Fewer opcode bytes, but 5 dependent instructions
(assuming the first constant can be executed in parallel
with an earlier instruction).
I think mine was only 4 dependent instructions.

All these are far faster than the loop...

	David


Patch

diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index e9ea2d45ae66..9f72cc427158 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -48,11 +48,6 @@  static bool msr_mtrr_valid(unsigned msr)
 	return false;
 }
 
-static bool valid_pat_type(unsigned t)
-{
-	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
-}
-
 static bool valid_mtrr_type(unsigned t)
 {
 	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
@@ -67,10 +62,7 @@  bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		return false;
 
 	if (msr == MSR_IA32_CR_PAT) {
-		for (i = 0; i < 8; i++)
-			if (!valid_pat_type((data >> (i * 8)) & 0xff))
-				return false;
-		return true;
+		return kvm_pat_valid(data);
 	} else if (msr == MSR_MTRRdefType) {
 		if (data & ~0xcff)
 			return false;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b6c533afbf27..b74679732cfc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1891,7 +1891,7 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
-			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+			if (!kvm_pat_valid(data))
 				return 1;
 			vmcs_write64(GUEST_IA32_PAT, data);
 			vcpu->arch.pat = data;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 28406aa1136d..7bc7ac9d2a44 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -347,4 +347,12 @@  static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
 	__this_cpu_write(current_vcpu, NULL);
 }
 
+static inline bool kvm_pat_valid(u64 data)
+{
+	if (data & 0xF8F8F8F8F8F8F8F8)
+		return false;
+	/* 0, 1, 4, 5, 6, 7 are valid values.  */
+	return (data | ((data & 0x0202020202020202) << 1)) == data;
+}
+
 #endif