[4/7] KVM: nVMX: move inject_page_fault tweak to .complete_mmu_init

Message ID 20210217145718.1217358-5-mlevitsk@redhat.com (mailing list archive)
State New, archived
Series: KVM: random nested fixes

Commit Message

Maxim Levitsky Feb. 17, 2021, 2:57 p.m. UTC
This fixes a (mostly theoretical) bug which can happen if ept=0 on the
host and a nested guest triggers an MMU context reset while running
nested. In this case the .inject_page_fault callback is lost.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 8 +-------
 arch/x86/kvm/vmx/nested.h | 1 +
 arch/x86/kvm/vmx/vmx.c    | 5 ++++-
 3 files changed, 6 insertions(+), 8 deletions(-)
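
For reference, the callback is lost because a full MMU context reset re-runs
the softmmu init path, which unconditionally restores the default callback.
A minimal sketch of the sequence (simplified; not an exact call trace):

    /* prepare_vmcs02(), ept=0: the nested tweak is installed */
    vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;

    /* later, while still running nested, something forces an MMU
     * context reset, e.g. via kvm_mmu_reset_context(): */
    kvm_init_mmu(vcpu, ...)
      -> init_kvm_softmmu(vcpu)
           /* ...which unconditionally rewrites the callback,
            * dropping the nested tweak: */
           context->inject_page_fault = kvm_inject_page_fault;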

Comments

Sean Christopherson Feb. 17, 2021, 5:29 p.m. UTC | #1
On Wed, Feb 17, 2021, Maxim Levitsky wrote:
> This fixes a (mostly theoretical) bug which can happen if ept=0
> on host and we run a nested guest which triggers a mmu context
> reset while running nested.
> In this case the .inject_page_fault callback will be lost.
>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 8 +-------
>  arch/x86/kvm/vmx/nested.h | 1 +
>  arch/x86/kvm/vmx/vmx.c    | 5 ++++-
>  3 files changed, 6 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 0b6dab6915a3..f9de729dbea6 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -419,7 +419,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
>  }
>  
>  
> -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
>  		struct x86_exception *fault)
>  {
>  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> @@ -2620,9 +2620,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>  		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
>  	}
>  
> -	if (!enable_ept)
> -		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
> -
>  	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
>  	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
>  				     vmcs12->guest_ia32_perf_global_ctrl)))
> @@ -4224,9 +4221,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>  	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
>  		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
>  
> -	if (!enable_ept)
> -		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;

Oof, please explicitly call out these types of side effects in the changelog;
it took me a while to piece together that this can be dropped because an MMU
reset is guaranteed and is also guaranteed to restore inject_page_fault.

I would even go so far as to say this particular line of code should be removed
in a separate commit.  Unless I'm overlooking something, this code is
effectively a nop, which means it doesn't need to be removed to make the bug fix
functionally correct.

All that being said, I'm pretty sure we can eliminate setting inject_page_fault
dynamically.  I think that would yield more maintainable code.  Following these
flows is a nightmare.  The change itself will be scarier, but I'm pretty sure
the end result will be a lot cleaner.

And I believe there's also a second bug that would be fixed by such an approach.
Doesn't vmx_inject_page_fault_nested() need to be used for the nested_mmu when
ept=1?  E.g. if the emulator injects a #PF to L2, L1 should still be able to
intercept the #PF even if L1 is using EPT.  This likely hasn't been noticed
because hypervisors typically don't intercept #PF when EPT is enabled.

Something like this (very incomplete):

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 30e9b0cb9abd..f957514a4d65 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4497,7 +4497,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->direct_map = true;
        context->get_guest_pgd = get_cr3;
        context->get_pdptr = kvm_pdptr_read;
-       context->inject_page_fault = kvm_inject_page_fault;

        if (!is_paging(vcpu)) {
                context->nx = false;
@@ -4687,7 +4686,6 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)

        context->get_guest_pgd     = get_cr3;
        context->get_pdptr         = kvm_pdptr_read;
-       context->inject_page_fault = kvm_inject_page_fault;
 }

 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -4701,7 +4699,6 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
        g_context->mmu_role.as_u64 = new_role.as_u64;
        g_context->get_guest_pgd     = get_cr3;
        g_context->get_pdptr         = kvm_pdptr_read;
-       g_context->inject_page_fault = kvm_inject_page_fault;

        /*
         * L2 page tables are never shadowed, so there is no need to sync
@@ -5272,6 +5269,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
        if (ret)
                goto fail_allocate_root;

+       static_call(kvm_x86_mmu_create)(vcpu);
+
        return ret;
  fail_allocate_root:
        free_mmu_pages(&vcpu->arch.guest_mmu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a63da447ede9..aa6c48295117 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -425,15 +425,14 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
 }


-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

-       WARN_ON(!is_guest_mode(vcpu));
-
-       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
-               !to_vmx(vcpu)->nested.nested_run_pending) {
+       if (is_guest_mode(vcpu) &&
+           nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
+           !to_vmx(vcpu)->nested.nested_run_pending) {
                vmcs12->vm_exit_intr_error_code = fault->error_code;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
@@ -2594,9 +2593,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
        }

-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
            WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                     vmcs12->guest_ia32_perf_global_ctrl)))
@@ -4198,9 +4194,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);

-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
        nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);

        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1204e5f0fe67..0e5ee22eea77 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3081,6 +3081,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vmx->emulation_required = emulation_required(vcpu);
 }

+static void vmx_mmu_create(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.root_mmu.inject_page_fault = vmx_inject_page_fault;
+       vcpu->arch.guest_mmu.inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.nested_mmu.inject_page_fault = vmx_inject_page_fault;
+}
+
 static int vmx_get_max_tdp_level(void)
 {
        if (cpu_has_vmx_ept_5levels())
@@ -7721,6 +7728,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {

        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,

+       .mmu_create = vmx_mmu_create,
        .load_mmu_pgd = vmx_load_mmu_pgd,

        .check_intercept = vmx_check_intercept,

> -
>  	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
>  
>  	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
> diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> index 197148d76b8f..2ab279744d38 100644
> --- a/arch/x86/kvm/vmx/nested.h
> +++ b/arch/x86/kvm/vmx/nested.h
> @@ -36,6 +36,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
>  void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
>  bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
>  				 int size);
> +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,struct x86_exception *fault);
>  
>  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index bf6ef674d688..c43324df4877 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3254,7 +3254,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
>  
>  static void vmx_complete_mmu_init(struct kvm_vcpu *vcpu)
>  {
> -
> +	if (!enable_ept && is_guest_mode(vcpu)) {
> +		WARN_ON(mmu_is_nested(vcpu));
> +		vcpu->arch.mmu->inject_page_fault = vmx_inject_page_fault_nested;
> +	}
>  }
>  
>  static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> -- 
> 2.26.2
>
Paolo Bonzini Feb. 17, 2021, 5:37 p.m. UTC | #2
On 17/02/21 18:29, Sean Christopherson wrote:
> All that being said, I'm pretty sure we can eliminate setting 
> inject_page_fault dynamically. I think that would yield more 
> maintainable code. Following these flows is a nightmare. The change 
> itself will be scarier, but I'm pretty sure the end result will be a lot 
> cleaner.

I had a similar reaction, though my proposal was different.

The only thing we're changing in complete_mmu_init is the page fault 
callback for init_kvm_softmmu, so couldn't that be the callback directly 
(i.e. something like context->inject_page_fault = 
kvm_x86_ops.inject_softmmu_page_fault)?  And then adding is_guest_mode 
to the conditional that is already in vmx_inject_page_fault_nested and 
svm_inject_page_fault_nested.
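
Roughly like this (inject_softmmu_page_fault is only a placeholder name for
the new hook; a sketch, not a complete patch):

    /* mmu.c: let the vendor code supply the softmmu fault callback */
    static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
    {
            ...
            context->inject_page_fault = kvm_x86_ops.inject_softmmu_page_fault;
    }

    /* vmx/nested.c: the callback itself does the guest-mode check */
    void vmx_inject_softmmu_page_fault(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
    {
            struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

            if (is_guest_mode(vcpu) &&
                nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
                !to_vmx(vcpu)->nested.nested_run_pending)
                    nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, ...);
            else
                    kvm_inject_page_fault(vcpu, fault);
    }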

That said, I'm also rusty on _why_ this code is needed.  Why isn't it 
enough to inject the exception normally, and let 
nested_vmx_check_exception decide whether to inject a vmexit to L1 or an 
exception into L2?

Also, bonus question which should have been in the 5/7 changelog: are 
there kvm-unit-tests testcases that fail with npt=0, and if not could we 
write one?  [Answer: the mode_switch testcase fails, but I haven't 
checked why].


Paolo
Sean Christopherson Feb. 17, 2021, 5:57 p.m. UTC | #3
On Wed, Feb 17, 2021, Paolo Bonzini wrote:
> On 17/02/21 18:29, Sean Christopherson wrote:
> > All that being said, I'm pretty sure we can eliminate setting
> > inject_page_fault dynamically. I think that would yield more
> > maintainable code. Following these flows is a nightmare. The change
> > itself will be scarier, but I'm pretty sure the end result will be a lot
> > cleaner.
> 
> I had a similar reaction, though my proposal was different.
> 
> The only thing we're changing in complete_mmu_init is the page fault
> callback for init_kvm_softmmu, so couldn't that be the callback directly
> (i.e. something like context->inject_page_fault =
> kvm_x86_ops.inject_softmmu_page_fault)?  And then adding is_guest_mode to
> the conditional that is already in vmx_inject_page_fault_nested and
> svm_inject_page_fault_nested.

Heh, that exact code crossed my mind as well.

> That said, I'm also rusty on _why_ this code is needed.  Why isn't it enough
> to inject the exception normally, and let nested_vmx_check_exception decide
> whether to inject a vmexit to L1 or an exception into L2?

Hmm, I suspect it was required at one point due to deficiencies elsewhere.
Handling this in the common fault handler logic does seem like the right
approach.

> Also, bonus question which should have been in the 5/7 changelog: are there
> kvm-unit-tests testcases that fail with npt=0, and if not could we write
> one?  [Answer: the mode_switch testcase fails, but I haven't checked why].
> 
> 
> Paolo
>
Paolo Bonzini Feb. 17, 2021, 6 p.m. UTC | #4
On 17/02/21 18:57, Sean Christopherson wrote:
>> That said, I'm also rusty on _why_ this code is needed.  Why isn't it enough
>> to inject the exception normally, and let nested_vmx_check_exception decide
>> whether to inject a vmexit to L1 or an exception into L2?
>
> Hmm, I suspect it was required at one point due to deficiencies elsewhere.
> Handling this in the common fault handler logic does seem like the right
> approach.

I think I'm going to merge a variant of patch 5 just to unbreak things. 
But we should get rid of all this because after the exception payload 
changes we shouldn't need it.

Paolo

>> Also, bonus question which should have been in the 5/7 changelog: are there
>> kvm-unit-tests testcases that fail with npt=0, and if not could we write
>> one?  [Answer: the mode_switch testcase fails, but I haven't checked why].
Maxim Levitsky Feb. 17, 2021, 6:43 p.m. UTC | #5
On Wed, 2021-02-17 at 09:29 -0800, Sean Christopherson wrote:
> On Wed, Feb 17, 2021, Maxim Levitsky wrote:
> > This fixes a (mostly theoretical) bug which can happen if ept=0
> > on host and we run a nested guest which triggers a mmu context
> > reset while running nested.
> > In this case the .inject_page_fault callback will be lost.
> > 
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  arch/x86/kvm/vmx/nested.c | 8 +-------
> >  arch/x86/kvm/vmx/nested.h | 1 +
> >  arch/x86/kvm/vmx/vmx.c    | 5 ++++-
> >  3 files changed, 6 insertions(+), 8 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 0b6dab6915a3..f9de729dbea6 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -419,7 +419,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
> >  }
> >  
> >  
> > -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> > +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> >  		struct x86_exception *fault)
> >  {
> >  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> > @@ -2620,9 +2620,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> >  		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
> >  	}
> >  
> > -	if (!enable_ept)
> > -		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
> > -
> >  	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
> >  	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
> >  				     vmcs12->guest_ia32_perf_global_ctrl)))
> > @@ -4224,9 +4221,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
> >  	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
> >  		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
> >  
> > -	if (!enable_ept)
> > -		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
> 
> Oof, please explicitly call out these types of side effects in the changelog,
> it took me a while to piece together that this can be dropped because a MMU
> reset is guaranteed and is also guaranteed to restore inject_page_fault.
> 
> I would even go so far as to say this particular line of code should be removed
> in a separate commit.  Unless I'm overlooking something, this code is
> effectively a nop, which means it doesn't need to be removed to make the bug fix
> functionally correct.
> 
> All that being said, I'm pretty sure we can eliminate setting inject_page_fault
> dynamically.  I think that would yield more maintainable code.  Following these
> flows is a nightmare.  The change itself will be scarier, but I'm pretty sure
> the end result will be a lot cleaner.
> 
> And I believe there's also a second bug that would be fixed by such an approach.
> Doesn't vmx_inject_page_fault_nested() need to be used for the nested_mmu when
> ept=1?  E.g. if the emulator injects a #PF to L2, L1 should still be able to
> intercept the #PF even if L1 is using EPT.  This likely hasn't been noticed
> because hypervisors typically don't intercept #PF when EPT is enabled.

Let me explain what I know about this:
 
There are basically 3 cases:
 
1. npt/ept disabled on the host. In this case we have a single level of
shadowing, and a nested hypervisor has to do its own shadowing on top of it.
Here the MMU itself has to generate page faults (they are a result of
hardware page faults, but are completely different), and in the case of
nesting these page faults sometimes have to be injected as VM exits.
 
2. npt/ept enabled on the host and disabled in the guest.
In this case we don't need to shadow anything, while the nested hypervisor
does need to do shadowing to run its guest.
In this case it is in fact likely that L1 intercepts the page faults,
but they are just reflected to it as-is, like what nested_svm_exit_special
does (it does have a special case for async page faults, which I need to
investigate).

This is where the bug that you mention can happen. I also haven't checked
how VMX reflects page faults to the nested guest.
 
3. (the common case) npt/ept enabled on both host and guest.
In this case the walk_mmu is used for all page faults, and it is tweaked
in a similar way (see nested_svm_init_mmu_context for example).
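
For reference, the tweak there looks roughly like this (quoting from memory,
not exact):

    /* nested_svm_init_mmu_context(): L2's page tables are provided by L1,
     * and a fault while walking them is reported back as an NPF vmexit */
    vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
    vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
    vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
    vcpu->arch.walk_mmu               = &vcpu->arch.nested_mmu;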


Also, if the emulator injects a page fault, then indeed I think the
bug will happen.


Best regards and thanks for the feedback,
	Maxim Levitsky

> 
> Something like this (very incomplete):
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 30e9b0cb9abd..f957514a4d65 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4497,7 +4497,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>         context->direct_map = true;
>         context->get_guest_pgd = get_cr3;
>         context->get_pdptr = kvm_pdptr_read;
> -       context->inject_page_fault = kvm_inject_page_fault;
> 
>         if (!is_paging(vcpu)) {
>                 context->nx = false;
> @@ -4687,7 +4686,6 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
> 
>         context->get_guest_pgd     = get_cr3;
>         context->get_pdptr         = kvm_pdptr_read;
> -       context->inject_page_fault = kvm_inject_page_fault;
>  }
> 
>  static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
> @@ -4701,7 +4699,6 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
>         g_context->mmu_role.as_u64 = new_role.as_u64;
>         g_context->get_guest_pgd     = get_cr3;
>         g_context->get_pdptr         = kvm_pdptr_read;
> -       g_context->inject_page_fault = kvm_inject_page_fault;
> 
>         /*
>          * L2 page tables are never shadowed, so there is no need to sync
> @@ -5272,6 +5269,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>         if (ret)
>                 goto fail_allocate_root;
> 
> +       static_call(kvm_x86_mmu_create)(vcpu);
> +
>         return ret;
>   fail_allocate_root:
>         free_mmu_pages(&vcpu->arch.guest_mmu);
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index a63da447ede9..aa6c48295117 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -425,15 +425,14 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
>  }
> 
> 
> -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> +static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
>                 struct x86_exception *fault)
>  {
>         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> 
> -       WARN_ON(!is_guest_mode(vcpu));
> -
> -       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
> -               !to_vmx(vcpu)->nested.nested_run_pending) {
> +       if (is_guest_mode(vcpu) &&
> +           nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
> +           !to_vmx(vcpu)->nested.nested_run_pending) {
>                 vmcs12->vm_exit_intr_error_code = fault->error_code;
>                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
>                                   PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
> @@ -2594,9 +2593,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>                 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
>         }
> 
> -       if (!enable_ept)
> -               vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
> -
>         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
>             WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
>                                      vmcs12->guest_ia32_perf_global_ctrl)))
> @@ -4198,9 +4194,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
>                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
> 
> -       if (!enable_ept)
> -               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
> -
>         nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
> 
>         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 1204e5f0fe67..0e5ee22eea77 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3081,6 +3081,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
>         vmx->emulation_required = emulation_required(vcpu);
>  }
> 
> +static void vmx_mmu_create(struct kvm_vcpu *vcpu)
> +{
> +       vcpu->arch.root_mmu.inject_page_fault = vmx_inject_page_fault;
> +       vcpu->arch.guest_mmu.inject_page_fault = nested_ept_inject_page_fault;
> +       vcpu->arch.nested_mmu.inject_page_fault = vmx_inject_page_fault;
> +}
> +
>  static int vmx_get_max_tdp_level(void)
>  {
>         if (cpu_has_vmx_ept_5levels())
> @@ -7721,6 +7728,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
> 
>         .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
> 
> +       .mmu_create = vmx_mmu_create,
>         .load_mmu_pgd = vmx_load_mmu_pgd,
> 
>         .check_intercept = vmx_check_intercept,
> 
> > -
> >  	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
> >  
> >  	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
> > diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> > index 197148d76b8f..2ab279744d38 100644
> > --- a/arch/x86/kvm/vmx/nested.h
> > +++ b/arch/x86/kvm/vmx/nested.h
> > @@ -36,6 +36,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
> >  void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
> >  bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
> >  				 int size);
> > +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,struct x86_exception *fault);
> >  
> >  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
> >  {
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index bf6ef674d688..c43324df4877 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -3254,7 +3254,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
> >  
> >  static void vmx_complete_mmu_init(struct kvm_vcpu *vcpu)
> >  {
> > -
> > +	if (!enable_ept && is_guest_mode(vcpu)) {
> > +		WARN_ON(mmu_is_nested(vcpu));
> > +		vcpu->arch.mmu->inject_page_fault = vmx_inject_page_fault_nested;
> > +	}
> >  }
> >  
> >  static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> > -- 
> > 2.26.2
> >
Maxim Levitsky Feb. 17, 2021, 6:49 p.m. UTC | #6
On Wed, 2021-02-17 at 18:37 +0100, Paolo Bonzini wrote:
> On 17/02/21 18:29, Sean Christopherson wrote:
> > All that being said, I'm pretty sure we can eliminate setting 
> > inject_page_fault dynamically. I think that would yield more 
> > maintainable code. Following these flows is a nightmare. The change 
> > itself will be scarier, but I'm pretty sure the end result will be a lot 
> > cleaner.

I agree with that.

> 
> I had a similar reaction, though my proposal was different.
> 
> The only thing we're changing in complete_mmu_init is the page fault 
> callback for init_kvm_softmmu, so couldn't that be the callback directly 
> (i.e. something like context->inject_page_fault = 
> kvm_x86_ops.inject_softmmu_page_fault)?  And then adding is_guest_mode 
> to the conditional that is already in vmx_inject_page_fault_nested and 
> svm_inject_page_fault_nested.

I was thinking about this as well; I tried to make a solution that is as
simple as possible and doesn't make things worse.
> 
> That said, I'm also rusty on _why_ this code is needed.  Why isn't it 
> enough to inject the exception normally, and let 
> nested_vmx_check_exception decide whether to inject a vmexit to L1 or an 
> exception into L2?
> 
> Also, bonus question which should have been in the 5/7 changelog: are 
> there kvm-unit-tests testcases that fail with npt=0, and if not could we 
> write one?  [Answer: the mode_switch testcase fails, but I haven't 
> checked why].

I agree with all of this. I'll see why this code is needed (it is needed,
since I once accidentally removed it on VMX, and it broke nesting with ept=0
in exactly the same way as it was broken on AMD).

I'll debug this a bit to see if I can make it work as you suggest.


Best regards,
	Maxim Levitsky
> 
> 
> Paolo
>
Paolo Bonzini Feb. 18, 2021, 9:45 a.m. UTC | #7
On 17/02/21 19:43, Maxim Levitsky wrote:
> 1. npt/ept disabled in the host. In this case we have a single shadowing
> and a nested hypervisor has to do its own shadowing on top of it.
> In this case the MMU itself has to generate page faults (they are a result
> of hardware page faults, but are completely different), and in case
> of nesting these page faults have to be sometimes injected as VM exits.
> 
> [...] Also if the emulator injects the page fault, then indeed I think the
> bug will happen.

But in both cases you (ought to) get an injected exception, which then 
becomes a page fault vmexit at the next check_nested_events.  That's the 
part that we are all collectively missing.
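
Sketching the flow I mean (from memory, simplified):

    /* 1. the fault is queued like any other exception */
    kvm_inject_page_fault(vcpu, fault);

    /* 2. before the next vmentry, nested events are rechecked */
    vmx_check_nested_events(vcpu)
      -> if the queued #PF matches the vmcs12 exception bitmap,
         nested_vmx_check_exception() says so and we do
         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, ...);
         otherwise the #PF is injected into L2 as usual.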

Paolo

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0b6dab6915a3..f9de729dbea6 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -419,7 +419,7 @@  static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
 }
 
 
-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 		struct x86_exception *fault)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -2620,9 +2620,6 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 	}
 
-	if (!enable_ept)
-		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
 	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
 				     vmcs12->guest_ia32_perf_global_ctrl)))
@@ -4224,9 +4221,6 @@  static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
 
-	if (!enable_ept)
-		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
 	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
 
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 197148d76b8f..2ab279744d38 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -36,6 +36,7 @@  void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
 				 int size);
+void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,struct x86_exception *fault);
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index bf6ef674d688..c43324df4877 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3254,7 +3254,10 @@  static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
 
 static void vmx_complete_mmu_init(struct kvm_vcpu *vcpu)
 {
-
+	if (!enable_ept && is_guest_mode(vcpu)) {
+		WARN_ON(mmu_is_nested(vcpu));
+		vcpu->arch.mmu->inject_page_fault = vmx_inject_page_fault_nested;
+	}
 }
 
 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)