
[01/16] KVM: x86/mmu: Exit to userspace with -EFAULT if private fault hits emulation

Message ID 20240228024147.41573-2-seanjc@google.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: Page fault and MMIO cleanups

Commit Message

Sean Christopherson Feb. 28, 2024, 2:41 a.m. UTC
Exit to userspace with -EFAULT / KVM_EXIT_MEMORY_FAULT if a private fault
triggers emulation of any kind, as KVM doesn't currently support emulating
access to guest private memory.  Practically speaking, private faults and
emulation are already mutually exclusive, but there are edge cases upon
edge cases where KVM can return RET_PF_EMULATE, and adding one last check
to harden against weird, unexpected combinations is inexpensive.

Suggested-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c          |  8 --------
 arch/x86/kvm/mmu/mmu_internal.h | 13 +++++++++++++
 2 files changed, 13 insertions(+), 8 deletions(-)

Comments

Xiaoyao Li March 1, 2024, 8:48 a.m. UTC | #1
On 2/28/2024 10:41 AM, Sean Christopherson wrote:
> Exit to userspace with -EFAULT / KVM_EXIT_MEMORY_FAULT if a private fault
> triggers emulation of any kind, as KVM doesn't currently support emulating
> access to guest private memory.  Practically speaking, private faults and
> emulation are already mutually exclusive, but there are edge cases upon
> edge cases where KVM can return RET_PF_EMULATE, and adding one last check
> to harden against weird, unexpected combinations is inexpensive.
> 
> Suggested-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> 
> [...]
> 
> @@ -320,6 +328,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>   	else
>   		r = vcpu->arch.mmu->page_fault(vcpu, &fault);

Beg for some comment to explain the paranoia.

> +	if (r == RET_PF_EMULATE && fault.is_private) {
> +		kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
> +		return -EFAULT;
> +	}
> +
>   	if (fault.write_fault_to_shadow_pgtable && emulation_type)
>   		*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
>
Gupta, Pankaj March 7, 2024, 12:52 p.m. UTC | #2
> Exit to userspace with -EFAULT / KVM_EXIT_MEMORY_FAULT if a private fault
> triggers emulation of any kind, as KVM doesn't currently support emulating
> access to guest private memory.  Practically speaking, private faults and
> emulation are already mutually exclusive, but there are edge cases upon
> edge cases where KVM can return RET_PF_EMULATE, and adding one last check

edge cases upon edge cases?

Just curious about the details of the edge case scenarios?

> to harden against weird, unexpected combinations is inexpensive.
Yan Zhao March 8, 2024, 4:22 a.m. UTC | #3
On Tue, Feb 27, 2024 at 06:41:32PM -0800, Sean Christopherson wrote:
> Exit to userspace with -EFAULT / KVM_EXIT_MEMORY_FAULT if a private fault
> triggers emulation of any kind, as KVM doesn't currently support emulating
> access to guest private memory.  Practically speaking, private faults and
> emulation are already mutually exclusive, but there are edge cases upon
> edge cases where KVM can return RET_PF_EMULATE, and adding one last check
> to harden against weird, unexpected combinations is inexpensive.
> 
> Suggested-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> 
> [...]
> 
> @@ -320,6 +328,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	else
>  		r = vcpu->arch.mmu->page_fault(vcpu, &fault);
>  
> +	if (r == RET_PF_EMULATE && fault.is_private) {
Should we just check VM type + RET_PF_EMULATE, and abort?
If r is RET_PF_EMULATE, and the fault is caused by accessing a shared address,
the emulation code could still hit errors if guest page table pages are in
private memory, right?

> +		kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
> +		return -EFAULT;
> +	}
Binbin Wu March 12, 2024, 2:59 a.m. UTC | #4
On 3/7/2024 8:52 PM, Gupta, Pankaj wrote:
>> Exit to userspace with -EFAULT / KVM_EXIT_MEMORY_FAULT if a private fault
>> triggers emulation of any kind, as KVM doesn't currently support emulating
>> access to guest private memory.  Practically speaking, private faults and
>> emulation are already mutually exclusive, but there are edge cases upon
>> edge cases where KVM can return RET_PF_EMULATE, and adding one last check
>
> edge cases upon edge cases?
>
> Just curious about the details of the edge case scenarios?

Same question out of curiosity.

Sean Christopherson April 4, 2024, 4:38 p.m. UTC | #5
On Tue, Mar 12, 2024, Binbin Wu wrote:
> 
> On 3/7/2024 8:52 PM, Gupta, Pankaj wrote:
> > > Exit to userspace with -EFAULT / KVM_EXIT_MEMORY_FAULT if a private fault
> > > triggers emulation of any kind, as KVM doesn't currently support
> > > emulating access to guest private memory.  Practically speaking, private
> > > faults and emulation are already mutually exclusive, but there are edge
> > > cases upon edge cases where KVM can return RET_PF_EMULATE, and adding one
> > > last check
> > 
> > edge cases upon edge cases?
> > 
> > Just curious about the details of the edge case scenarios?
> 
> Same question out of curiosity.

Accesses that hit the APIC-access page and gfns that are write-tracked are the
two most likely candidates.  Even plain old emulated MMIO falls into this bucket,
e.g. if KVM botched things and generated a RSVD fault on a private mapping.  I'll
reword that line to

  faults and emulation are already mutually exclusive, but there are many
  flows that can result in KVM returning RET_PF_EMULATE, and adding one last
  check to harden against weird, unexpected combinations and/or KVM bugs is
  inexpensive.

to make it sound less dramatic/hand-wavy.
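
For illustration only, a condensed sketch of how those flows can surface
RET_PF_EMULATE.  This is not the real KVM fault path, which is spread across
kvm_faultin_pfn(), the page-track checks, and the MMIO handling; the helper
below is invented for the sketch, though the fields and the individual checks
approximate the upstream names:

static int sketch_fault_needs_emulation(struct kvm_vcpu *vcpu,
					struct kvm_page_fault *fault)
{
	/* No memslot backs the gfn: classic emulated MMIO. */
	if (!fault->slot)
		return RET_PF_EMULATE;

	/*
	 * Writes to write-tracked gfns, e.g. shadowed guest page tables,
	 * are intercepted and emulated instead of being mapped.
	 */
	if (fault->write &&
	    kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
		return RET_PF_EMULATE;

	/* Accesses that hit the APIC-access page are also emulated. */
	if (fault->gfn == gpa_to_gfn(APIC_DEFAULT_PHYS_BASE))
		return RET_PF_EMULATE;

	return RET_PF_CONTINUE;
}

None of these flows should ever fire for a private fault, which is exactly
why the new check is hardening rather than a functional change.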
Sean Christopherson April 4, 2024, 4:45 p.m. UTC | #6
On Fri, Mar 08, 2024, Yan Zhao wrote:
> On Tue, Feb 27, 2024 at 06:41:32PM -0800, Sean Christopherson wrote:
> > @@ -320,6 +328,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> >  	else
> >  		r = vcpu->arch.mmu->page_fault(vcpu, &fault);
> >  
> > +	if (r == RET_PF_EMULATE && fault.is_private) {
> Should we just check VM type + RET_PF_EMULATE, and abort?

No, the goal here is purely to ensure that emulation is never triggered for
private memory.  Guarding against attempting emulation for a VM type that doesn't
support emulation at all is something different.

And more concretely, as of this commit, all VM types that support private memory
(i.e. SW_PROTECTED_VM) support emulation, just not for private memory.

> If r is RET_PF_EMULATE, and the fault is caused by accessing a shared address,
> the emulation code could still hit errors if guest page table pages are in
> private memory, right?

Yes, which is why I squeezed in a documentation update for v6.8 to make it super
clear that SW_PROTECTED_VM is a development vehicle, i.e. that trying to use it
to run a real VM is all but guaranteed to cause explosions.
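
For reference, a sketch of the VM-type-keyed variant being discussed, as
opposed to what the patch actually does.  This is hypothetical code, not from
the series; kvm_arch_has_private_mem() is the existing helper such a check
would presumably build on, dropped into the same spot in
kvm_mmu_do_page_fault():

	/*
	 * Hypothetical: abort emulation based on the VM type rather than
	 * on the individual fault.  As explained above, this is a
	 * different policy, and as of this commit it would be too broad:
	 * SW_PROTECTED_VM does support emulation for shared accesses,
	 * just not for private ones.
	 */
	if (r == RET_PF_EMULATE && kvm_arch_has_private_mem(vcpu->kvm)) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
		return -EFAULT;
	}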

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e4cc7f764980..e2fd74e06ff8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4309,14 +4309,6 @@ static inline u8 kvm_max_level_for_order(int order)
 	return PG_LEVEL_4K;
 }
 
-static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
-					      struct kvm_page_fault *fault)
-{
-	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
-				      PAGE_SIZE, fault->write, fault->exec,
-				      fault->is_private);
-}
-
 static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
 				   struct kvm_page_fault *fault)
 {
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 0669a8a668ca..0eea6c5a824d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -279,6 +279,14 @@ enum {
 	RET_PF_SPURIOUS,
 };
 
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+						     struct kvm_page_fault *fault)
+{
+	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+				      PAGE_SIZE, fault->write, fault->exec,
+				      fault->is_private);
+}
+
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 					u32 err, bool prefetch, int *emulation_type)
 {
@@ -320,6 +328,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	else
 		r = vcpu->arch.mmu->page_fault(vcpu, &fault);
 
+	if (r == RET_PF_EMULATE && fault.is_private) {
+		kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+		return -EFAULT;
+	}
+
 	if (fault.write_fault_to_shadow_pgtable && emulation_type)
 		*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
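
For completeness, a hedged sketch of what handling the resulting exit might
look like on the userspace side.  This is hypothetical VMM code; the kvm_run
memory_fault fields and KVM_MEMORY_EXIT_FLAG_PRIVATE are the upstream UAPI
names, everything else is invented for the example:

	/* KVM_RUN fails with EFAULT and fills run->memory_fault. */
	int ret = ioctl(vcpu_fd, KVM_RUN, 0);

	if (ret < 0 && errno == EFAULT &&
	    run->exit_reason == KVM_EXIT_MEMORY_FAULT) {
		__u64 gpa  = run->memory_fault.gpa;
		__u64 size = run->memory_fault.size;
		bool private = run->memory_fault.flags &
			       KVM_MEMORY_EXIT_FLAG_PRIVATE;

		/*
		 * Typically: convert the range between shared and private
		 * via KVM_SET_MEMORY_ATTRIBUTES and re-enter the guest,
		 * or terminate the VM if the access is unexpected.
		 */
		handle_memory_fault(gpa, size, private);
	}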