
[v3,05/13] nEPT: MMU context for nested EPT

Message ID 1368939152-11406-5-git-send-email-jun.nakajima@intel.com (mailing list archive)
State New, archived

Commit Message

Nakajima, Jun May 19, 2013, 4:52 a.m. UTC
From: Nadav Har'El <nyh@il.ibm.com>

KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new "MMU context" for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
---
 arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 92 insertions(+), 1 deletion(-)

Comments

Xiao Guangrong May 21, 2013, 8:50 a.m. UTC | #1
On 05/19/2013 12:52 PM, Jun Nakajima wrote:
> From: Nadav Har'El <nyh@il.ibm.com>
> 
> KVM's existing shadow MMU code already supports nested TDP. To use it, we
> need to set up a new "MMU context" for nested EPT, and create a few callbacks
> for it (nested_ept_*()). This context should also use the EPT versions of
> the page table access functions (defined in the previous patch).
> Then, we need to switch back and forth between this nested context and the
> regular MMU context when switching between L1 and L2 (when L1 runs this L2
> with EPT).
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
> ---
>  arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu.h |  1 +
>  arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 92 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 6c1670f..37f8d7f 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3653,6 +3653,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
>  }
>  EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
> 
> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
> +{
> +	ASSERT(vcpu);
> +	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
> +
> +	context->shadow_root_level = kvm_x86_ops->get_tdp_level();

That means the L1 guest always uses page-walk length == 4? But in your previous patch,
it can be 2.

> +
> +	context->nx = is_nx(vcpu); /* TODO: ? */

Hmm? EPT always supports NX.
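
Every EPT paging-structure entry carries its own execute-permission bit,
independent of guest EFER.NX. Bit numbers per the SDM; the mask names below
are illustrative, not KVM's:

/* EPT entry permission bits (Intel SDM, EPT); illustrative names. */
#define EPT_PERM_R	(1ull << 0)	/* read allowed */
#define EPT_PERM_W	(1ull << 1)	/* write allowed */
#define EPT_PERM_X	(1ull << 2)	/* execute allowed -- no EFER.NX gate */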

> +	context->new_cr3 = paging_new_cr3;
> +	context->page_fault = EPT_page_fault;
> +	context->gva_to_gpa = EPT_gva_to_gpa;
> +	context->sync_page = EPT_sync_page;
> +	context->invlpg = EPT_invlpg;
> +	context->update_pte = EPT_update_pte;
> +	context->free = paging_free;
> +	context->root_level = context->shadow_root_level;
> +	context->root_hpa = INVALID_PAGE;
> +	context->direct_map = false;
> +
> +	/* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
> +	   something different.
> +	 */

Exactly. :)
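
What "something different" would have to capture, roughly: EPT entries are
validated against EPT-misconfiguration rules rather than against the
CR0/CR4/EFER-dependent reserved-bit masks. One such rule, purely as an
illustration (not KVM's actual checker): an entry granting write but not
read access is illegal.

/* Illustrative sketch of one EPT-misconfig rule from the SDM:
 * write access allowed (bit 1) while read access (bit 0) is clear. */
static bool ept_entry_misconfigured(u64 epte)
{
	bool r = epte & (1ull << 0);
	bool w = epte & (1ull << 1);

	return w && !r;		/* write-only EPT entries cause EPT misconfig */
}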

> +	reset_rsvds_bits_mask(vcpu, context);
> +
> +
> +	/* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
> +	   they are done, or why they write to vcpu->arch.mmu and not context
> +	 */
> +	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
> +	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
> +	vcpu->arch.mmu.base_role.smep_andnot_wp =
> +		kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
> +		!is_write_protection(vcpu);

I guess we need not care about these, since EPT page permissions do not depend
on them.

> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
> +
>  static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
>  {
>  	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 2adcbc2..8fc94dd 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
>  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
>  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
> 
>  static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
>  {
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index fb9cae5..a88432f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1045,6 +1045,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
>  	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
>  }
> 
> +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
> +{
> +	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
> +}
> +
>  static inline bool is_exception(u32 intr_info)
>  {
>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -7311,6 +7316,46 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
>  		entry->ecx |= bit(X86_FEATURE_VMX);
>  }
> 
> +/* Callbacks for nested_ept_init_mmu_context: */
> +
> +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
> +{
> +	/* return the page table to be shadowed - in our case, EPT12 */
> +	return get_vmcs12(vcpu)->ept_pointer;
> +}
> +
> +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
> +	struct x86_exception *fault)
> +{
> +	struct vmcs12 *vmcs12;
> +	nested_vmx_vmexit(vcpu);
> +	vmcs12 = get_vmcs12(vcpu);
> +	/*
> +	 * Note no need to set vmcs12->vm_exit_reason as it is already copied
> +	 * from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
> +	 */
> +	vmcs12->exit_qualification = fault->error_code;

Hmm, you directly copy the error code from FNAME(walk_addr_generic),
but its format is different, and I did not see you cook the error code
in the previous patches.

Nakajima, Jun May 21, 2013, 10:30 p.m. UTC | #2
On Tue, May 21, 2013 at 1:50 AM, Xiao Guangrong
<xiaoguangrong@linux.vnet.ibm.com> wrote:
> On 05/19/2013 12:52 PM, Jun Nakajima wrote:
>> From: Nadav Har'El <nyh@il.ibm.com>
>>
>> KVM's existing shadow MMU code already supports nested TDP. To use it, we
>> need to set up a new "MMU context" for nested EPT, and create a few callbacks
>> for it (nested_ept_*()). This context should also use the EPT versions of
>> the page table access functions (defined in the previous patch).
>> Then, we need to switch back and forth between this nested context and the
>> regular MMU context when switching between L1 and L2 (when L1 runs this L2
>> with EPT).
>>
>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
>> ---
>>  arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++
>>  arch/x86/kvm/mmu.h |  1 +
>>  arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 92 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 6c1670f..37f8d7f 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -3653,6 +3653,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
>>
>> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
>> +{
>> +     ASSERT(vcpu);
>> +     ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
>> +
>> +     context->shadow_root_level = kvm_x86_ops->get_tdp_level();
>
> That means the L1 guest always uses page-walk length == 4? But in your previous patch,
> it can be 2.

We want to support "page-walk length == 4" only.
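
For reference, the walk length L1 uses is visible in the EPT pointer it
writes: bits 2:0 hold the memory type (6 == write-back) and bits 5:3 hold
the page-walk length minus 1, so a 4-level-only implementation expects 3
there. A rough sketch with illustrative names, not KVM's actual helpers:

/* EPTP layout per the SDM; names here are illustrative. */
#define EPTP_MEMTYPE_WB		6ull
#define EPTP_WALK_LEN(levels)	(((levels) - 1ull) << 3)

static inline u64 make_4level_eptp(u64 ept_root_pa)
{
	return ept_root_pa | EPTP_WALK_LEN(4) | EPTP_MEMTYPE_WB;
}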

>
>> +
>> +     context->nx = is_nx(vcpu); /* TODO: ? */
>
> Hmm? EPT always supports NX.
>
>> +     context->new_cr3 = paging_new_cr3;
>> +     context->page_fault = EPT_page_fault;
>> +     context->gva_to_gpa = EPT_gva_to_gpa;
>> +     context->sync_page = EPT_sync_page;
>> +     context->invlpg = EPT_invlpg;
>> +     context->update_pte = EPT_update_pte;
>> +     context->free = paging_free;
>> +     context->root_level = context->shadow_root_level;
>> +     context->root_hpa = INVALID_PAGE;
>> +     context->direct_map = false;
>> +
>> +     /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
>> +        something different.
>> +      */
>
> Exactly. :)
>
>> +     reset_rsvds_bits_mask(vcpu, context);
>> +
>> +
>> +     /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
>> +        they are done, or why they write to vcpu->arch.mmu and not context
>> +      */
>> +     vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
>> +     vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
>> +     vcpu->arch.mmu.base_role.smep_andnot_wp =
>> +             kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
>> +             !is_write_protection(vcpu);
>
> I guess we need not care about these, since EPT page permissions do not depend
> on them.

Right. I'll clean this up.

>
>> +
>> +     return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
>> +
>>  static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
>>  {
>>       int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 2adcbc2..8fc94dd 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
>>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
>>  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
>>  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
>> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
>>
>>  static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
>>  {
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index fb9cae5..a88432f 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -1045,6 +1045,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
>>       return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
>>  }
>>
>> +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
>> +{
>> +     return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
>> +}
>> +
>>  static inline bool is_exception(u32 intr_info)
>>  {
>>       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -7311,6 +7316,46 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
>>               entry->ecx |= bit(X86_FEATURE_VMX);
>>  }
>>
>> +/* Callbacks for nested_ept_init_mmu_context: */
>> +
>> +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
>> +{
>> +     /* return the page table to be shadowed - in our case, EPT12 */
>> +     return get_vmcs12(vcpu)->ept_pointer;
>> +}
>> +
>> +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
>> +     struct x86_exception *fault)
>> +{
>> +     struct vmcs12 *vmcs12;
>> +     nested_vmx_vmexit(vcpu);
>> +     vmcs12 = get_vmcs12(vcpu);
>> +     /*
>> +      * Note no need to set vmcs12->vm_exit_reason as it is already copied
>> +      * from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
>> +      */
>> +     vmcs12->exit_qualification = fault->error_code;
>
> Hmm, you directly copy the error code from FNAME(walk_addr_generic),
> but its format is different, and I did not see you cook the error code
> in the previous patches.
>

Right. Basically this is the original code from Nadav; patches 12 and 13
fix/cook the error code.
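
The gist of the mismatch: fault->error_code arrives in x86 #PF format
(PFERR_* bits), while the exit qualification of an EPT violation encodes
the access as bit 0 = read, bit 1 = write, bit 2 = instruction fetch. Very
roughly, something like the sketch below is needed -- illustrative only,
the real conversion lands in patches 12/13:

/* Illustrative only; patches 12/13 do the real translation. */
static u64 pf_error_to_ept_exit_qual(u32 error_code)
{
	u64 qual = 0;

	if (error_code & PFERR_WRITE_MASK)
		qual |= 1ull << 1;	/* EPT violation: write access */
	else if (error_code & PFERR_FETCH_MASK)
		qual |= 1ull << 2;	/* EPT violation: instruction fetch */
	else
		qual |= 1ull << 0;	/* EPT violation: read access */

	return qual;
}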

--
Jun
Intel Open Source Technology Center

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6c1670f..37f8d7f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3653,6 +3653,44 @@  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+	ASSERT(vcpu);
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+	context->nx = is_nx(vcpu); /* TODO: ? */
+	context->new_cr3 = paging_new_cr3;
+	context->page_fault = EPT_page_fault;
+	context->gva_to_gpa = EPT_gva_to_gpa;
+	context->sync_page = EPT_sync_page;
+	context->invlpg = EPT_invlpg;
+	context->update_pte = EPT_update_pte;
+	context->free = paging_free;
+	context->root_level = context->shadow_root_level;
+	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
+
+	/* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+	   something different.
+	 */
+	reset_rsvds_bits_mask(vcpu, context);
+
+
+	/* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+	   they are done, or why they write to vcpu->arch.mmu and not context
+	 */
+	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+	vcpu->arch.mmu.base_role.smep_andnot_wp =
+		kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+		!is_write_protection(vcpu);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2..8fc94dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@  int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fb9cae5..a88432f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,6 +1045,11 @@  static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -7311,6 +7316,46 @@  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 		entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+	/* return the page table to be shadowed - in our case, EPT12 */
+	return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+	struct x86_exception *fault)
+{
+	struct vmcs12 *vmcs12;
+	nested_vmx_vmexit(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+	/*
+	 * Note no need to set vmcs12->vm_exit_reason as it is already copied
+	 * from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+	 */
+	vmcs12->exit_qualification = fault->error_code;
+	vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+	int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
+	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
+	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	return r;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7531,6 +7576,11 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmx_flush_tlb(vcpu);
 	}
 
+	if (nested_cpu_has_ept(vmcs12)) {
+		kvm_mmu_unload(vcpu);
+		nested_ept_init_mmu_context(vcpu);
+	}
+
 	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->guest_ia32_efer;
 	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7975,7 +8025,9 @@  static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
 	kvm_set_cr4(vcpu, vmcs12->host_cr4);
 
-	/* shadow page tables on either EPT or shadow page tables */
+	if (nested_cpu_has_ept(vmcs12))
+		nested_ept_uninit_mmu_context(vcpu);
+
 	kvm_set_cr3(vcpu, vmcs12->host_cr3);
 	kvm_mmu_reset_context(vcpu);
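
Taken together, nested_ept_init_mmu_context() and
nested_ept_uninit_mmu_context() amount to repointing vcpu->arch.walk_mmu.
A self-contained toy model of that swap -- every name below is
hypothetical; it only makes the pointer dance explicit:

#include <stdio.h>

struct toy_mmu { const char *name; };

struct toy_vcpu_arch {
	struct toy_mmu mmu;		/* drives hardware; shadows EPT12 while in L2 */
	struct toy_mmu nested_mmu;	/* walks L2's own page tables */
	struct toy_mmu *walk_mmu;	/* the context address walks go through */
};

static void toy_nested_ept_init(struct toy_vcpu_arch *a)
{
	a->walk_mmu = &a->nested_mmu;	/* entering L2 */
}

static void toy_nested_ept_uninit(struct toy_vcpu_arch *a)
{
	a->walk_mmu = &a->mmu;		/* back to L1 */
}

int main(void)
{
	struct toy_vcpu_arch a = {
		.mmu = { "mmu (shadow of EPT12)" },
		.nested_mmu = { "nested_mmu (L2 page tables)" },
	};

	a.walk_mmu = &a.mmu;
	toy_nested_ept_init(&a);
	printf("L2 running: walk_mmu -> %s\n", a.walk_mmu->name);
	toy_nested_ept_uninit(&a);
	printf("L1 running: walk_mmu -> %s\n", a.walk_mmu->name);
	return 0;
}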