| Message ID | 20170728195245.1018-4-bsd@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
> +{
> +        return nested_cpu_has_vmfunc(vmcs12) &&
> +                (vmcs12->vm_function_control &
> +                 VMX_VMFUNC_EPTP_SWITCHING);
> +}
> +
>  static inline bool is_nmi(u32 intr_info)
>  {
>          return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>          if (cpu_has_vmx_vmfunc()) {
>                  vmx->nested.nested_vmx_secondary_ctls_high |=
>                          SECONDARY_EXEC_ENABLE_VMFUNC;
> -                vmx->nested.nested_vmx_vmfunc_controls = 0;
> +                /*
> +                 * Advertise EPTP switching unconditionally
> +                 * since we emulate it
> +                 */
> +                vmx->nested.nested_vmx_vmfunc_controls =
> +                        VMX_VMFUNC_EPTP_SWITCHING;

Should this only be advertised if enable_ept is set (if the guest also
sees/can use SECONDARY_EXEC_ENABLE_EPT)?

>          }
> 
>          /*
> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>          return 1;
>  }
> 
> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)

check_..._valid -> valid_ept_address() ?

> +{
> +        struct vcpu_vmx *vmx = to_vmx(vcpu);
> +        u64 mask = VMX_EPT_RWX_MASK;
> +        int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
> +
> +        /* Check for execute_only validity */
> +        if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
> +                if (!(vmx->nested.nested_vmx_ept_caps &
> +                      VMX_EPT_EXECUTE_ONLY_BIT))
> +                        return false;
> +        }
> +
> +        /* Bits 5:3 must be 3 */
> +        if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
> +                return false;
> +
> +        /* Reserved bits should not be set */
> +        if (address >> maxphyaddr || ((address >> 7) & 0x1f))
> +                return false;
> +
> +        /* AD, if set, should be supported */
> +        if ((address & VMX_EPT_AD_ENABLE_BIT)) {
> +                if (!enable_ept_ad_bits)
> +                        return false;
> +                mmu->ept_ad = true;
> +        } else
> +                mmu->ept_ad = false;

I wouldn't expect a "check" function to modify the mmu. Can you move
modifying the mmu outside of this function (leaving the
enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
_after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)

> +
> +        return true;
> +}
> +
> +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
> +                                     struct vmcs12 *vmcs12)
> +{
> +        u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
> +        u64 *l1_eptp_list, address;
> +        struct page *page;
> +
> +        if (!nested_cpu_has_eptp_switching(vmcs12) ||
> +            !nested_cpu_has_ept(vmcs12))
> +                return 1;
> +
> +        if (index >= VMFUNC_EPTP_ENTRIES)
> +                return 1;
> +
> +        page = nested_get_page(vcpu, vmcs12->eptp_list_address);
> +        if (!page)
> +                return 1;
> +
> +        l1_eptp_list = kmap(page);
> +        address = l1_eptp_list[index];
> +
> +        /*
> +         * If the (L2) guest does a vmfunc to the currently
> +         * active ept pointer, we don't have to do anything else
> +         */
> +        if (vmcs12->ept_pointer != address) {
> +                if (!check_ept_address_valid(vcpu, address)) {
> +                        kunmap(page);
> +                        nested_release_page_clean(page);
> +                        return 1;
> +                }
> +                kvm_mmu_unload(vcpu);
> +                vmcs12->ept_pointer = address;
> +                /*
> +                 * TODO: Check what's the correct approach in case
> +                 * mmu reload fails. Currently, we just let the next
> +                 * reload potentially fail
> +                 */
> +                kvm_mmu_reload(vcpu);

So, what actually happens if this generates a triple fault? I guess we
will kill the (nested) hypervisor?

> +        }
> +
> +        kunmap(page);
> +        nested_release_page_clean(page);
> +        return 0;
> +}
> +
>  static int handle_vmfunc(struct kvm_vcpu *vcpu)
>  {
>          struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -7786,7 +7879,16 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>          vmcs12 = get_vmcs12(vcpu);
>          if ((vmcs12->vm_function_control & (1 << function)) == 0)
>                  goto fail;
> -        WARN_ONCE(1, "VMCS12 VM function control should have been zero");
> +
> +        switch (function) {
> +        case 0:
> +                if (nested_vmx_eptp_switching(vcpu, vmcs12))
> +                        goto fail;
> +                break;
> +        default:
> +                goto fail;
> +        }
> +        return kvm_skip_emulated_instruction(vcpu);
> 
>  fail:
>          nested_vmx_vmexit(vcpu, vmx->exit_reason,
> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>                                  vmx->nested.nested_vmx_entry_ctls_high))
>                  return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> 
> -        if (nested_cpu_has_vmfunc(vmcs12) &&
> -            (vmcs12->vm_function_control &
> -             ~vmx->nested.nested_vmx_vmfunc_controls))
> -                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +        if (nested_cpu_has_vmfunc(vmcs12)) {
> +                if (vmcs12->vm_function_control &
> +                    ~vmx->nested.nested_vmx_vmfunc_controls)
> +                        return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +
> +                if (nested_cpu_has_eptp_switching(vmcs12)) {
> +                        if (!nested_cpu_has_ept(vmcs12) ||
> +                            (vmcs12->eptp_list_address >>
> +                             cpuid_maxphyaddr(vcpu)) ||
> +                            !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
> +                                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +                }
> +        }
> +
> 
>          if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
>                  return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> 
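For illustration, the side-effect-free variant David is asking for could look like the sketch below. The macros are the ones the patch already uses; the split itself (the AD handling reduced to a pure capability check, with the mmu update left to the caller) is only a sketch of the suggestion, not code from the series:

    static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
    {
            struct vcpu_vmx *vmx = to_vmx(vcpu);
            int maxphyaddr = cpuid_maxphyaddr(vcpu);

            /* Execute-only EPT entries are valid only when advertised */
            if ((address & VMX_EPT_RWX_MASK) == VMX_EPT_EXECUTABLE_MASK &&
                !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT))
                    return false;

            /* Bits 5:3 must encode a 4-level page walk */
            if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
                    return false;

            /* Reserved bits (11:7 and everything above MAXPHYADDR) */
            if (address >> maxphyaddr || ((address >> 7) & 0x1f))
                    return false;

            /*
             * The AD-enable bit may only be set when the module supports
             * it. Note that no mmu state is written here; the caller
             * decides when to propagate the bit.
             */
            if ((address & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
                    return false;

            return true;
    }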
Hi David,

David Hildenbrand <david@redhat.com> writes:

>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +        return nested_cpu_has_vmfunc(vmcs12) &&
>> +                (vmcs12->vm_function_control &
>> +                 VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>>  static inline bool is_nmi(u32 intr_info)
>>  {
>>          return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>          if (cpu_has_vmx_vmfunc()) {
>>                  vmx->nested.nested_vmx_secondary_ctls_high |=
>>                          SECONDARY_EXEC_ENABLE_VMFUNC;
>> -                vmx->nested.nested_vmx_vmfunc_controls = 0;
>> +                /*
>> +                 * Advertise EPTP switching unconditionally
>> +                 * since we emulate it
>> +                 */
>> +                vmx->nested.nested_vmx_vmfunc_controls =
>> +                        VMX_VMFUNC_EPTP_SWITCHING;
>
> Should this only be advertised if enable_ept is set (if the guest also
> sees/can use SECONDARY_EXEC_ENABLE_EPT)?

This represents the function control MSR, which on the hardware is
a RO value. The checks for enable_ept and such are somewhere else.

>>          }
>> 
>>          /*
>> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>>          return 1;
>>  }
>> 
>> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
>
> check_..._valid -> valid_ept_address() ?

I think either of the names is fine and I would prefer not
to respin unless you feel really strongly about it :)

>
>> +{
>> +        struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +        u64 mask = VMX_EPT_RWX_MASK;
>> +        int maxphyaddr = cpuid_maxphyaddr(vcpu);
>> +        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
>> +
>> +        /* Check for execute_only validity */
>> +        if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
>> +                if (!(vmx->nested.nested_vmx_ept_caps &
>> +                      VMX_EPT_EXECUTE_ONLY_BIT))
>> +                        return false;
>> +        }
>> +
>> +        /* Bits 5:3 must be 3 */
>> +        if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
>> +                return false;
>> +
>> +        /* Reserved bits should not be set */
>> +        if (address >> maxphyaddr || ((address >> 7) & 0x1f))
>> +                return false;
>> +
>> +        /* AD, if set, should be supported */
>> +        if ((address & VMX_EPT_AD_ENABLE_BIT)) {
>> +                if (!enable_ept_ad_bits)
>> +                        return false;
>> +                mmu->ept_ad = true;
>> +        } else
>> +                mmu->ept_ad = false;
>
> I wouldn't expect a "check" function to modify the mmu. Can you move
> modifying the mmu outside of this function (leaving the
> enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
> _after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)
>

Well, the correct thing to do is to have a wrapper around it in mmu.c,
without calling it directly here, and also to call this function before
nested_mmu is initialized. I am working on a separate patch for this btw.

It seems to me setting mmu->ept_ad after kvm_mmu_unload is unnecessary
since it's already being set only if everything else succeeds.
kvm_mmu_unload() isn't affected by the setting of this flag if I understand
correctly.

>> +
>> +        return true;
>> +}
>> +
>> +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
>> +                                     struct vmcs12 *vmcs12)
>> +{
>> +        u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> +        u64 *l1_eptp_list, address;
>> +        struct page *page;
>> +
>> +        if (!nested_cpu_has_eptp_switching(vmcs12) ||
>> +            !nested_cpu_has_ept(vmcs12))
>> +                return 1;
>> +
>> +        if (index >= VMFUNC_EPTP_ENTRIES)
>> +                return 1;
>> +
>> +        page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> +        if (!page)
>> +                return 1;
>> +
>> +        l1_eptp_list = kmap(page);
>> +        address = l1_eptp_list[index];
>> +
>> +        /*
>> +         * If the (L2) guest does a vmfunc to the currently
>> +         * active ept pointer, we don't have to do anything else
>> +         */
>> +        if (vmcs12->ept_pointer != address) {
>> +                if (!check_ept_address_valid(vcpu, address)) {
>> +                        kunmap(page);
>> +                        nested_release_page_clean(page);
>> +                        return 1;
>> +                }
>> +                kvm_mmu_unload(vcpu);
>> +                vmcs12->ept_pointer = address;
>> +                /*
>> +                 * TODO: Check what's the correct approach in case
>> +                 * mmu reload fails. Currently, we just let the next
>> +                 * reload potentially fail
>> +                 */
>> +                kvm_mmu_reload(vcpu);
>
> So, what actually happens if this generates a triple fault? I guess we
> will kill the (nested) hypervisor?

Yes. Not sure what the right thing to do is, though...

Bandan

>> +        }
>> +
>> +        kunmap(page);
>> +        nested_release_page_clean(page);
>> +        return 0;
>> +}
>> +
>>  static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>  {
>>          struct vcpu_vmx *vmx = to_vmx(vcpu);
>> @@ -7786,7 +7879,16 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>          vmcs12 = get_vmcs12(vcpu);
>>          if ((vmcs12->vm_function_control & (1 << function)) == 0)
>>                  goto fail;
>> -        WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>> +
>> +        switch (function) {
>> +        case 0:
>> +                if (nested_vmx_eptp_switching(vcpu, vmcs12))
>> +                        goto fail;
>> +                break;
>> +        default:
>> +                goto fail;
>> +        }
>> +        return kvm_skip_emulated_instruction(vcpu);
>> 
>>  fail:
>>          nested_vmx_vmexit(vcpu, vmx->exit_reason,
>> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>                                  vmx->nested.nested_vmx_entry_ctls_high))
>>                  return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> 
>> -        if (nested_cpu_has_vmfunc(vmcs12) &&
>> -            (vmcs12->vm_function_control &
>> -             ~vmx->nested.nested_vmx_vmfunc_controls))
>> -                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +        if (nested_cpu_has_vmfunc(vmcs12)) {
>> +                if (vmcs12->vm_function_control &
>> +                    ~vmx->nested.nested_vmx_vmfunc_controls)
>> +                        return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +
>> +                if (nested_cpu_has_eptp_switching(vmcs12)) {
>> +                        if (!nested_cpu_has_ept(vmcs12) ||
>> +                            (vmcs12->eptp_list_address >>
>> +                             cpuid_maxphyaddr(vcpu)) ||
>> +                            !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
>> +                                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +                }
>> +        }
>> +
>> 
>>          if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
>>                  return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> 
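A wrapper of the kind Bandan mentions might look roughly like this; the helper name and its placement in mmu.c are assumptions for illustration only:

    /*
     * Hypothetical mmu.c helper: encapsulate the EPT A/D configuration
     * so that vmx.c never pokes at vcpu->arch.walk_mmu fields directly.
     * Name and signature are illustrative, not from the posted series.
     */
    void kvm_mmu_set_ept_ad(struct kvm_vcpu *vcpu, bool accessed_dirty)
    {
            struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

            mmu->ept_ad = accessed_dirty;
    }

The EPTP-switching path (and, later, the vm-entry path) would call this helper after validation instead of writing the field in place.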
On 31.07.2017 21:32, Bandan Das wrote:
> Hi David,
>
> David Hildenbrand <david@redhat.com> writes:
>
>>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>>> +{
>>> +        return nested_cpu_has_vmfunc(vmcs12) &&
>>> +                (vmcs12->vm_function_control &
>>> +                 VMX_VMFUNC_EPTP_SWITCHING);
>>> +}
>>> +
>>>  static inline bool is_nmi(u32 intr_info)
>>>  {
>>>          return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>>          if (cpu_has_vmx_vmfunc()) {
>>>                  vmx->nested.nested_vmx_secondary_ctls_high |=
>>>                          SECONDARY_EXEC_ENABLE_VMFUNC;
>>> -                vmx->nested.nested_vmx_vmfunc_controls = 0;
>>> +                /*
>>> +                 * Advertise EPTP switching unconditionally
>>> +                 * since we emulate it
>>> +                 */
>>> +                vmx->nested.nested_vmx_vmfunc_controls =
>>> +                        VMX_VMFUNC_EPTP_SWITCHING;
>>
>> Should this only be advertised if enable_ept is set (if the guest also
>> sees/can use SECONDARY_EXEC_ENABLE_EPT)?
>
> This represents the function control MSR, which on the hardware is
> a RO value. The checks for enable_ept and such are somewhere else.

This is the

        if (!nested_cpu_has_eptp_switching(vmcs12) ||
            !nested_cpu_has_ept(vmcs12))
                return 1;

check then, I assume. Makes sense.

>
>>>          }
>>> 
>>>          /*
>>> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>>>          return 1;
>>>  }
>>> 
>>> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
>>
>> check_..._valid -> valid_ept_address() ?
>
> I think either of the names is fine and I would prefer not
> to respin unless you feel really strongly about it :)

Sure, if you have to respin, you can fix this up.

>
>>
>>> +{
>>> +        struct vcpu_vmx *vmx = to_vmx(vcpu);
>>> +        u64 mask = VMX_EPT_RWX_MASK;
>>> +        int maxphyaddr = cpuid_maxphyaddr(vcpu);
>>> +        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
>>> +
>>> +        /* Check for execute_only validity */
>>> +        if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
>>> +                if (!(vmx->nested.nested_vmx_ept_caps &
>>> +                      VMX_EPT_EXECUTE_ONLY_BIT))
>>> +                        return false;
>>> +        }
>>> +
>>> +        /* Bits 5:3 must be 3 */
>>> +        if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
>>> +                return false;
>>> +
>>> +        /* Reserved bits should not be set */
>>> +        if (address >> maxphyaddr || ((address >> 7) & 0x1f))
>>> +                return false;
>>> +
>>> +        /* AD, if set, should be supported */
>>> +        if ((address & VMX_EPT_AD_ENABLE_BIT)) {
>>> +                if (!enable_ept_ad_bits)
>>> +                        return false;
>>> +                mmu->ept_ad = true;
>>> +        } else
>>> +                mmu->ept_ad = false;
>>
>> I wouldn't expect a "check" function to modify the mmu. Can you move
>> modifying the mmu outside of this function (leaving the
>> enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
>> _after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)
>>
>
> Well, the correct thing to do is to have a wrapper around it in mmu.c,
> without calling it directly here, and also to call this function before
> nested_mmu is initialized. I am working on a separate patch for this btw.

Sounds good. Also thought that encapsulating this might be a good option.

> It seems to me setting mmu->ept_ad after kvm_mmu_unload is unnecessary
> since it's already being set only if everything else succeeds.
> kvm_mmu_unload() isn't affected by the setting of this flag if I understand
> correctly.

It looks at least cleaner to set everything up after the unload has
happened (and could avoid future bugs, if that unload would ever rely
on that setting!), plus we can reuse that function more easily
(e.g. vm entry). But that's just my personal opinion. Feel free to ignore.

>
>>> +
>>> +        return true;
>>> +}
>>> +
>>> +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
>>> +                                     struct vmcs12 *vmcs12)
>>> +{
>>> +        u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>>> +        u64 *l1_eptp_list, address;
>>> +        struct page *page;
>>> +
>>> +        if (!nested_cpu_has_eptp_switching(vmcs12) ||
>>> +            !nested_cpu_has_ept(vmcs12))
>>> +                return 1;
>>> +
>>> +        if (index >= VMFUNC_EPTP_ENTRIES)
>>> +                return 1;
>>> +
>>> +        page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>>> +        if (!page)
>>> +                return 1;
>>> +
>>> +        l1_eptp_list = kmap(page);
>>> +        address = l1_eptp_list[index];
>>> +
>>> +        /*
>>> +         * If the (L2) guest does a vmfunc to the currently
>>> +         * active ept pointer, we don't have to do anything else
>>> +         */
>>> +        if (vmcs12->ept_pointer != address) {
>>> +                if (!check_ept_address_valid(vcpu, address)) {
>>> +                        kunmap(page);
>>> +                        nested_release_page_clean(page);
>>> +                        return 1;
>>> +                }
>>> +                kvm_mmu_unload(vcpu);
>>> +                vmcs12->ept_pointer = address;
>>> +                /*
>>> +                 * TODO: Check what's the correct approach in case
>>> +                 * mmu reload fails. Currently, we just let the next
>>> +                 * reload potentially fail
>>> +                 */
>>> +                kvm_mmu_reload(vcpu);
>>
>> So, what actually happens if this generates a triple fault? I guess we
>> will kill the (nested) hypervisor?
>
> Yes. Not sure what the right thing to do is, though...

Wonder what happens on real hardware.
2017-08-01 13:40+0200, David Hildenbrand:
> On 31.07.2017 21:32, Bandan Das wrote:
> > David Hildenbrand <david@redhat.com> writes:
> >>> +        /* AD, if set, should be supported */
> >>> +        if ((address & VMX_EPT_AD_ENABLE_BIT)) {
> >>> +                if (!enable_ept_ad_bits)
> >>> +                        return false;
> >>> +                mmu->ept_ad = true;
> >>> +        } else
> >>> +                mmu->ept_ad = false;

This block should also set the mmu->base_role.ad_disabled.

(The idea being that things should be done as if the EPTP was set during
a VM entry. The only notable difference is that we do not reload PDPTRs.)

> >> I wouldn't expect a "check" function to modify the mmu. Can you move
> >> modifying the mmu outside of this function (leaving the
> >> enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
> >> _after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)
> >>
> >
> > Well, the correct thing to do is to have a wrapper around it in mmu.c,
> > without calling it directly here, and also to call this function before
> > nested_mmu is initialized. I am working on a separate patch for this btw.
>
> Sounds good. Also thought that encapsulating this might be a good option.

Seconded. :)

> >>> +        if (vmcs12->ept_pointer != address) {
> >>> +                if (!check_ept_address_valid(vcpu, address)) {
> >>> +                        kunmap(page);
> >>> +                        nested_release_page_clean(page);
> >>> +                        return 1;
> >>> +                }
> >>> +                kvm_mmu_unload(vcpu);
> >>> +                vmcs12->ept_pointer = address;
> >>> +                /*
> >>> +                 * TODO: Check what's the correct approach in case
> >>> +                 * mmu reload fails. Currently, we just let the next
> >>> +                 * reload potentially fail
> >>> +                 */
> >>> +                kvm_mmu_reload(vcpu);
> >>
> >> So, what actually happens if this generates a triple fault? I guess we
> >> will kill the (nested) hypervisor?
> >
> > Yes. Not sure what the right thing to do is, though...

Right, we can even drop kvm_mmu_reload() here for now and let the
one in vcpu_enter_guest() take care of the thing.

> Wonder what happens on real hardware.

This operation cannot fail on real hardware. All addresses within the
physical address limit return something when read. On Intel, this is a
value with all bits set (-1) and will cause an EPT misconfiguration VM
exit on the next page walk (instruction decoding).

Thanks.
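Taken together, David's ordering suggestion and Radim's base_role.ad_disabled remark would reshape the switching path roughly as sketched below; valid_ept_address() is the hypothetical pure check from earlier, and mmu is vcpu->arch.walk_mmu as in the patch:

    if (vmcs12->ept_pointer != address) {
            if (!valid_ept_address(vcpu, address)) {
                    kunmap(page);
                    nested_release_page_clean(page);
                    return 1;
            }

            kvm_mmu_unload(vcpu);

            /*
             * Only mutate mmu state once the old roots are gone,
             * mirroring what a VM entry would do; keep ept_ad and the
             * page-role bit in sync, per Radim's remark.
             */
            mmu->ept_ad = !!(address & VMX_EPT_AD_ENABLE_BIT);
            mmu->base_role.ad_disabled = !mmu->ept_ad;

            vmcs12->ept_pointer = address;
            kvm_mmu_reload(vcpu);
    }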
2017-07-28 15:52-0400, Bandan Das:
> When L2 uses vmfunc, L0 utilizes the associated vmexit to
> emulate a switching of the ept pointer by reloading the
> guest MMU.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> ---
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>          return 1;
>  }
> 
> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
> +{
> +        struct vcpu_vmx *vmx = to_vmx(vcpu);
> +        u64 mask = VMX_EPT_RWX_MASK;
> +        int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
> +
> +        /* Check for execute_only validity */
> +        if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
> +                if (!(vmx->nested.nested_vmx_ept_caps &
> +                      VMX_EPT_EXECUTE_ONLY_BIT))
> +                        return false;
> +        }

This check looks wrong ... bits 0:2 define the memory type:

  0 = Uncacheable (UC)
  6 = Write-back (WB)

Whether those are supported is indicated in the IA32_VMX_EPT_VPID_CAP
MSR, so I think it should return false when

  (address & 0x7) == 0 && !(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT)

the same for 6 and VMX_EPTP_WB_BIT and unconditionally for the remaining
types.

Btw., when is the TLB flushed after EPTP switching?

> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>                                  vmx->nested.nested_vmx_entry_ctls_high))
>                  return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> 
> -        if (nested_cpu_has_vmfunc(vmcs12) &&
> -            (vmcs12->vm_function_control &
> -             ~vmx->nested.nested_vmx_vmfunc_controls))
> -                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +        if (nested_cpu_has_vmfunc(vmcs12)) {
> +                if (vmcs12->vm_function_control &
> +                    ~vmx->nested.nested_vmx_vmfunc_controls)
> +                        return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +
> +                if (nested_cpu_has_eptp_switching(vmcs12)) {
> +                        if (!nested_cpu_has_ept(vmcs12) ||
> +                            (vmcs12->eptp_list_address >>
> +                             cpuid_maxphyaddr(vcpu)) ||
> +                            !IS_ALIGNED(vmcs12->eptp_list_address, 4096))

page_address_valid() would make this check a bit nicer,

thanks.

> +                                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
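Spelled out, the memory-type check Radim describes could look like the sketch below. The raw encodings 0 (UC) and 6 (WB) come from the SDM, and VMX_EPTP_UC_BIT/VMX_EPTP_WB_BIT are the capability bits he names; treat this as a sketch of the fix, not the respun patch:

    /*
     * EPTP bits 2:0 encode the memory type. Only UC (0) and WB (6) are
     * architecturally defined, and each is valid only if the matching
     * IA32_VMX_EPT_VPID_CAP bit is exposed to L1.
     */
    switch (address & 0x7) {
    case 0:         /* uncacheable */
            if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
                    return false;
            break;
    case 6:         /* write-back */
            if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
                    return false;
            break;
    default:        /* all other encodings are reserved */
            return false;
    }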
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-28 15:52-0400, Bandan Das:
>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> emulate a switching of the ept pointer by reloading the
>> guest MMU.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> ---
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>>          return 1;
>>  }
>> 
>> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
>> +{
>> +        struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +        u64 mask = VMX_EPT_RWX_MASK;
>> +        int maxphyaddr = cpuid_maxphyaddr(vcpu);
>> +        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
>> +
>> +        /* Check for execute_only validity */
>> +        if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
>> +                if (!(vmx->nested.nested_vmx_ept_caps &
>> +                      VMX_EPT_EXECUTE_ONLY_BIT))
>> +                        return false;
>> +        }
>
> This check looks wrong ... bits 0:2 define the memory type:
>
>   0 = Uncacheable (UC)
>   6 = Write-back (WB)

Oops, sorry, I badly messed this up! I will incorporate these changes
and the suggestions by David into a new version.

> Whether those are supported is indicated in the IA32_VMX_EPT_VPID_CAP
> MSR, so I think it should return false when
>
>   (address & 0x7) == 0 && !(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT)
>
> the same for 6 and VMX_EPTP_WB_BIT and unconditionally for the remaining
> types.
>
> Btw., when is the TLB flushed after EPTP switching?

From what I understand, mmu_sync_roots() calls kvm_mmu_flush_or_zap(),
which sets KVM_REQ_TLB_FLUSH.

Bandan

>> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>                                  vmx->nested.nested_vmx_entry_ctls_high))
>>                  return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> 
>> -        if (nested_cpu_has_vmfunc(vmcs12) &&
>> -            (vmcs12->vm_function_control &
>> -             ~vmx->nested.nested_vmx_vmfunc_controls))
>> -                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +        if (nested_cpu_has_vmfunc(vmcs12)) {
>> +                if (vmcs12->vm_function_control &
>> +                    ~vmx->nested.nested_vmx_vmfunc_controls)
>> +                        return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +
>> +                if (nested_cpu_has_eptp_switching(vmcs12)) {
>> +                        if (!nested_cpu_has_ept(vmcs12) ||
>> +                            (vmcs12->eptp_list_address >>
>> +                             cpuid_maxphyaddr(vcpu)) ||
>> +                            !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
>
> page_address_valid() would make this check a bit nicer,
>
> thanks.
>
>> +                                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
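With the helper Radim suggests, the vm-entry prerequisite check collapses to something like this sketch (page_address_valid() checks page alignment and the MAXPHYADDR limit):

    if (nested_cpu_has_eptp_switching(vmcs12)) {
            if (!nested_cpu_has_ept(vmcs12) ||
                !page_address_valid(vcpu, vmcs12->eptp_list_address))
                    return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
    }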
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da5375e..5f63a2e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -115,6 +115,10 @@
 #define VMX_MISC_SAVE_EFER_LMA          0x00000020
 #define VMX_MISC_ACTIVITY_HLT           0x00000040
 
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING       0x00000001
+#define VMFUNC_EPTP_ENTRIES             512
+
 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
 {
         return vmx_basic & GENMASK_ULL(30, 0);
@@ -200,6 +204,8 @@ enum vmcs_field {
         EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
         EOI_EXIT_BITMAP3                = 0x00002022,
         EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+        EPTP_LIST_ADDRESS               = 0x00002024,
+        EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
         VMREAD_BITMAP                   = 0x00002026,
         VMWRITE_BITMAP                  = 0x00002028,
         XSS_EXIT_BITMAP                 = 0x0000202C,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe8f5fc..f1ab783 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -246,6 +246,7 @@ struct __packed vmcs12 {
         u64 eoi_exit_bitmap1;
         u64 eoi_exit_bitmap2;
         u64 eoi_exit_bitmap3;
+        u64 eptp_list_address;
         u64 xss_exit_bitmap;
         u64 guest_physical_address;
         u64 vmcs_link_pointer;
@@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
         FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
         FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
         FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+        FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
         FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
 }
 
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+        return nested_cpu_has_vmfunc(vmcs12) &&
+                (vmcs12->vm_function_control &
+                 VMX_VMFUNC_EPTP_SWITCHING);
+}
+
 static inline bool is_nmi(u32 intr_info)
 {
         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
         if (cpu_has_vmx_vmfunc()) {
                 vmx->nested.nested_vmx_secondary_ctls_high |=
                         SECONDARY_EXEC_ENABLE_VMFUNC;
-                vmx->nested.nested_vmx_vmfunc_controls = 0;
+                /*
+                 * Advertise EPTP switching unconditionally
+                 * since we emulate it
+                 */
+                vmx->nested.nested_vmx_vmfunc_controls =
+                        VMX_VMFUNC_EPTP_SWITCHING;
         }
 
         /*
@@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
         return 1;
 }
 
+static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        u64 mask = VMX_EPT_RWX_MASK;
+        int maxphyaddr = cpuid_maxphyaddr(vcpu);
+        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+        /* Check for execute_only validity */
+        if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
+                if (!(vmx->nested.nested_vmx_ept_caps &
+                      VMX_EPT_EXECUTE_ONLY_BIT))
+                        return false;
+        }
+
+        /* Bits 5:3 must be 3 */
+        if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
+                return false;
+
+        /* Reserved bits should not be set */
+        if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+                return false;
+
+        /* AD, if set, should be supported */
+        if ((address & VMX_EPT_AD_ENABLE_BIT)) {
+                if (!enable_ept_ad_bits)
+                        return false;
+                mmu->ept_ad = true;
+        } else
+                mmu->ept_ad = false;
+
+        return true;
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+                                     struct vmcs12 *vmcs12)
+{
+        u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+        u64 *l1_eptp_list, address;
+        struct page *page;
+
+        if (!nested_cpu_has_eptp_switching(vmcs12) ||
+            !nested_cpu_has_ept(vmcs12))
+                return 1;
+
+        if (index >= VMFUNC_EPTP_ENTRIES)
+                return 1;
+
+        page = nested_get_page(vcpu, vmcs12->eptp_list_address);
+        if (!page)
+                return 1;
+
+        l1_eptp_list = kmap(page);
+        address = l1_eptp_list[index];
+
+        /*
+         * If the (L2) guest does a vmfunc to the currently
+         * active ept pointer, we don't have to do anything else
+         */
+        if (vmcs12->ept_pointer != address) {
+                if (!check_ept_address_valid(vcpu, address)) {
+                        kunmap(page);
+                        nested_release_page_clean(page);
+                        return 1;
+                }
+                kvm_mmu_unload(vcpu);
+                vmcs12->ept_pointer = address;
+                /*
+                 * TODO: Check what's the correct approach in case
+                 * mmu reload fails. Currently, we just let the next
+                 * reload potentially fail
+                 */
+                kvm_mmu_reload(vcpu);
+        }
+
+        kunmap(page);
+        nested_release_page_clean(page);
+        return 0;
+}
+
 static int handle_vmfunc(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7786,7 +7879,16 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
         vmcs12 = get_vmcs12(vcpu);
         if ((vmcs12->vm_function_control & (1 << function)) == 0)
                 goto fail;
-        WARN_ONCE(1, "VMCS12 VM function control should have been zero");
+
+        switch (function) {
+        case 0:
+                if (nested_vmx_eptp_switching(vcpu, vmcs12))
+                        goto fail;
+                break;
+        default:
+                goto fail;
+        }
+        return kvm_skip_emulated_instruction(vcpu);
 
 fail:
         nested_vmx_vmexit(vcpu, vmx->exit_reason,
@@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                                 vmx->nested.nested_vmx_entry_ctls_high))
                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-        if (nested_cpu_has_vmfunc(vmcs12) &&
-            (vmcs12->vm_function_control &
-             ~vmx->nested.nested_vmx_vmfunc_controls))
-                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+        if (nested_cpu_has_vmfunc(vmcs12)) {
+                if (vmcs12->vm_function_control &
+                    ~vmx->nested.nested_vmx_vmfunc_controls)
+                        return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+                if (nested_cpu_has_eptp_switching(vmcs12)) {
+                        if (!nested_cpu_has_ept(vmcs12) ||
+                            (vmcs12->eptp_list_address >>
+                             cpuid_maxphyaddr(vcpu)) ||
+                            !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
+                                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+                }
+        }
+
 
         if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
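For context, this is what the emulated function looks like from the guest side. The wrapper below is a sketch based on the VMFUNC encoding in the SDM (EAX = 0 selects EPTP switching, ECX indexes the EPTP list), not code from this series:

    /*
     * VMFUNC is encoded as 0f 01 d4. EAX selects the function
     * (0 = EPTP switching) and ECX the index into the 512-entry
     * EPTP list that L1 published via EPTP_LIST_ADDRESS.
     */
    static inline void vmfunc_switch_eptp(u32 index)
    {
            asm volatile(".byte 0x0f, 0x01, 0xd4"
                         : : "a" (0), "c" (index)
                         : "memory");
    }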