
[v5,3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor

Message ID 20170728195245.1018-4-bsd@redhat.com (mailing list archive)
State New, archived

Commit Message

Bandan Das July 28, 2017, 7:52 p.m. UTC
When L2 uses vmfunc, L0 utilizes the associated vmexit to
emulate a switching of the ept pointer by reloading the
guest MMU.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Bandan Das <bsd@redhat.com>
---
 arch/x86/include/asm/vmx.h |   6 +++
 arch/x86/kvm/vmx.c         | 124 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 124 insertions(+), 6 deletions(-)

Comments

David Hildenbrand July 31, 2017, 11:59 a.m. UTC | #1
> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
> +{
> +	return nested_cpu_has_vmfunc(vmcs12) &&
> +		(vmcs12->vm_function_control &
> +		 VMX_VMFUNC_EPTP_SWITCHING);
> +}
> +
>  static inline bool is_nmi(u32 intr_info)
>  {
>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>  	if (cpu_has_vmx_vmfunc()) {
>  		vmx->nested.nested_vmx_secondary_ctls_high |=
>  			SECONDARY_EXEC_ENABLE_VMFUNC;
> -		vmx->nested.nested_vmx_vmfunc_controls = 0;
> +		/*
> +		 * Advertise EPTP switching unconditionally
> +		 * since we emulate it
> +		 */
> +		vmx->nested.nested_vmx_vmfunc_controls =
> +			VMX_VMFUNC_EPTP_SWITCHING;

Should this only be advertised if enable_ept is set (if the guest also
sees/can use SECONDARY_EXEC_ENABLE_EPT)?

>  	}
>  
>  	/*
> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)

check_..._valid -> valid_ept_address() ?

> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u64 mask = VMX_EPT_RWX_MASK;
> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
> +
> +	/* Check for execute_only validity */
> +	if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
> +		if (!(vmx->nested.nested_vmx_ept_caps &
> +		      VMX_EPT_EXECUTE_ONLY_BIT))
> +			return false;
> +	}
> +
> +	/* Bits 5:3 must be 3 */
> +	if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
> +		return false;
> +
> +	/* Reserved bits should not be set */
> +	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
> +		return false;
> +
> +	/* AD, if set, should be supported */
> +	if ((address & VMX_EPT_AD_ENABLE_BIT)) {
> +		if (!enable_ept_ad_bits)
> +			return false;
> +		mmu->ept_ad = true;
> +	} else
> +		mmu->ept_ad = false;

I wouldn't expect a "check" function to modify the mmu. Can you move
modifying the mmu outside of this function (leaving the
enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
_after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)
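
Just to illustrate, a rough, untested sketch of the separation I have in mind
(the check stays side-effect free, the caller updates the mmu):

	/* in the validity check: no mmu side effects */
	if ((address & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
		return false;

	/* in nested_vmx_eptp_switching(), once the new EPTP is validated */
	kvm_mmu_unload(vcpu);
	vcpu->arch.walk_mmu->ept_ad = !!(address & VMX_EPT_AD_ENABLE_BIT);
	vmcs12->ept_pointer = address;
	kvm_mmu_reload(vcpu);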

> +
> +	return true;
> +}
> +
> +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
> +				     struct vmcs12 *vmcs12)
> +{
> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
> +	u64 *l1_eptp_list, address;
> +	struct page *page;
> +
> +	if (!nested_cpu_has_eptp_switching(vmcs12) ||
> +	    !nested_cpu_has_ept(vmcs12))
> +		return 1;
> +
> +	if (index >= VMFUNC_EPTP_ENTRIES)
> +		return 1;
> +
> +	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
> +	if (!page)
> +		return 1;
> +
> +	l1_eptp_list = kmap(page);
> +	address = l1_eptp_list[index];
> +
> +	/*
> +	 * If the (L2) guest does a vmfunc to the currently
> +	 * active ept pointer, we don't have to do anything else
> +	 */
> +	if (vmcs12->ept_pointer != address) {
> +		if (!check_ept_address_valid(vcpu, address)) {
> +			kunmap(page);
> +			nested_release_page_clean(page);
> +			return 1;
> +		}
> +		kvm_mmu_unload(vcpu);
> +		vmcs12->ept_pointer = address;
> +		/*
> +		 * TODO: Check what's the correct approach in case
> +		 * mmu reload fails. Currently, we just let the next
> +		 * reload potentially fail
> +		 */
> +		kvm_mmu_reload(vcpu);

So, what actually happens if this generates a triple fault? I guess we
will kill the (nested) hypervisor?

> +	}
> +
> +	kunmap(page);
> +	nested_release_page_clean(page);
> +	return 0;
> +}
> +
>  static int handle_vmfunc(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -7786,7 +7879,16 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>  	vmcs12 = get_vmcs12(vcpu);
>  	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>  		goto fail;
> -	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
> +
> +	switch (function) {
> +	case 0:
> +		if (nested_vmx_eptp_switching(vcpu, vmcs12))
> +			goto fail;
> +		break;
> +	default:
> +		goto fail;
> +	}
> +	return kvm_skip_emulated_instruction(vcpu);
>  
>  fail:
>  	nested_vmx_vmexit(vcpu, vmx->exit_reason,
> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  				vmx->nested.nested_vmx_entry_ctls_high))
>  		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>  
> -	if (nested_cpu_has_vmfunc(vmcs12) &&
> -	    (vmcs12->vm_function_control &
> -	     ~vmx->nested.nested_vmx_vmfunc_controls))
> -		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +	if (nested_cpu_has_vmfunc(vmcs12)) {
> +		if (vmcs12->vm_function_control &
> +		    ~vmx->nested.nested_vmx_vmfunc_controls)
> +			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +
> +		if (nested_cpu_has_eptp_switching(vmcs12)) {
> +			if (!nested_cpu_has_ept(vmcs12) ||
> +			    (vmcs12->eptp_list_address >>
> +			     cpuid_maxphyaddr(vcpu)) ||
> +			    !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
> +				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +		}
> +	}
> +
>  
>  	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
>  		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>
Bandan Das July 31, 2017, 7:32 p.m. UTC | #2
Hi David,

David Hildenbrand <david@redhat.com> writes:

>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>> +		(vmcs12->vm_function_control &
>> +		 VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>>  static inline bool is_nmi(u32 intr_info)
>>  {
>>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>  	if (cpu_has_vmx_vmfunc()) {
>>  		vmx->nested.nested_vmx_secondary_ctls_high |=
>>  			SECONDARY_EXEC_ENABLE_VMFUNC;
>> -		vmx->nested.nested_vmx_vmfunc_controls = 0;
>> +		/*
>> +		 * Advertise EPTP switching unconditionally
>> +		 * since we emulate it
>> +		 */
>> +		vmx->nested.nested_vmx_vmfunc_controls =
>> +			VMX_VMFUNC_EPTP_SWITCHING;
>
> Should this only be advertised if enable_ept is set (if the guest also
> sees/can use SECONDARY_EXEC_ENABLE_EPT)?

This represents the function control MSR, which on hardware is a
read-only value. The checks for enable_ept and such are done elsewhere.

>>  	}
>>  
>>  	/*
>> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>>  	return 1;
>>  }
>>  
>> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
>
> check_..._valid -> valid_ept_address() ?

I think either of the names is fine and I would prefer not
to respin unless you feel really strongly about it :) 

>
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	u64 mask = VMX_EPT_RWX_MASK;
>> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
>> +	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
>> +
>> +	/* Check for execute_only validity */
>> +	if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
>> +		if (!(vmx->nested.nested_vmx_ept_caps &
>> +		      VMX_EPT_EXECUTE_ONLY_BIT))
>> +			return false;
>> +	}
>> +
>> +	/* Bits 5:3 must be 3 */
>> +	if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
>> +		return false;
>> +
>> +	/* Reserved bits should not be set */
>> +	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
>> +		return false;
>> +
>> +	/* AD, if set, should be supported */
>> +	if ((address & VMX_EPT_AD_ENABLE_BIT)) {
>> +		if (!enable_ept_ad_bits)
>> +			return false;
>> +		mmu->ept_ad = true;
>> +	} else
>> +		mmu->ept_ad = false;
>
> I wouldn't expect a "check" function to modify the mmu. Can you move
> modifying the mmu outside of this function (leaving the
> enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
> _after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)
>

Well, the correct thing to do is to have a wrapper around it in mmu.c,
rather than modifying the mmu directly here, and also to call this function
before nested_mmu is initialized. I am working on a separate patch for this btw.
It seems to me setting mmu->ept_ad after kvm_mmu_unload is unnecessary
since it's already being set only if everything else succeeds.
kvm_mmu_unload() isn't affected by the setting of this flag if I understand
correctly.
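
The wrapper I have in mind would be something as simple as the following
(purely hypothetical at this point, name and placement are placeholders):

	/* mmu.c: keep vmx.c from poking into the walk_mmu directly */
	void kvm_mmu_set_ept_ad(struct kvm_vcpu *vcpu, bool ept_ad)
	{
		vcpu->arch.walk_mmu->ept_ad = ept_ad;
	}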

>> +
>> +	return true;
>> +}
>> +
>> +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
>> +				     struct vmcs12 *vmcs12)
>> +{
>> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> +	u64 *l1_eptp_list, address;
>> +	struct page *page;
>> +
>> +	if (!nested_cpu_has_eptp_switching(vmcs12) ||
>> +	    !nested_cpu_has_ept(vmcs12))
>> +		return 1;
>> +
>> +	if (index >= VMFUNC_EPTP_ENTRIES)
>> +		return 1;
>> +
>> +	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> +	if (!page)
>> +		return 1;
>> +
>> +	l1_eptp_list = kmap(page);
>> +	address = l1_eptp_list[index];
>> +
>> +	/*
>> +	 * If the (L2) guest does a vmfunc to the currently
>> +	 * active ept pointer, we don't have to do anything else
>> +	 */
>> +	if (vmcs12->ept_pointer != address) {
>> +		if (!check_ept_address_valid(vcpu, address)) {
>> +			kunmap(page);
>> +			nested_release_page_clean(page);
>> +			return 1;
>> +		}
>> +		kvm_mmu_unload(vcpu);
>> +		vmcs12->ept_pointer = address;
>> +		/*
>> +		 * TODO: Check what's the correct approach in case
>> +		 * mmu reload fails. Currently, we just let the next
>> +		 * reload potentially fail
>> +		 */
>> +		kvm_mmu_reload(vcpu);
>
>> So, what actually happens if this generates a triple fault? I guess we
> will kill the (nested) hypervisor?

Yes. Not sure what the right thing to do is, though...

Bandan

>> +	}
>> +
>> +	kunmap(page);
>> +	nested_release_page_clean(page);
>> +	return 0;
>> +}
>> +
>>  static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>  {
>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> @@ -7786,7 +7879,16 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>  	vmcs12 = get_vmcs12(vcpu);
>>  	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>>  		goto fail;
>> -	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>> +
>> +	switch (function) {
>> +	case 0:
>> +		if (nested_vmx_eptp_switching(vcpu, vmcs12))
>> +			goto fail;
>> +		break;
>> +	default:
>> +		goto fail;
>> +	}
>> +	return kvm_skip_emulated_instruction(vcpu);
>>  
>>  fail:
>>  	nested_vmx_vmexit(vcpu, vmx->exit_reason,
>> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  				vmx->nested.nested_vmx_entry_ctls_high))
>>  		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>>  
>> -	if (nested_cpu_has_vmfunc(vmcs12) &&
>> -	    (vmcs12->vm_function_control &
>> -	     ~vmx->nested.nested_vmx_vmfunc_controls))
>> -		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +	if (nested_cpu_has_vmfunc(vmcs12)) {
>> +		if (vmcs12->vm_function_control &
>> +		    ~vmx->nested.nested_vmx_vmfunc_controls)
>> +			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +
>> +		if (nested_cpu_has_eptp_switching(vmcs12)) {
>> +			if (!nested_cpu_has_ept(vmcs12) ||
>> +			    (vmcs12->eptp_list_address >>
>> +			     cpuid_maxphyaddr(vcpu)) ||
>> +			    !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
>> +				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +		}
>> +	}
>> +
>>  
>>  	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
>>  		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>>
David Hildenbrand Aug. 1, 2017, 11:40 a.m. UTC | #3
On 31.07.2017 21:32, Bandan Das wrote:
> Hi David,
> 
> David Hildenbrand <david@redhat.com> writes:
> 
>>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>>> +{
>>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>>> +		(vmcs12->vm_function_control &
>>> +		 VMX_VMFUNC_EPTP_SWITCHING);
>>> +}
>>> +
>>>  static inline bool is_nmi(u32 intr_info)
>>>  {
>>>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>>  	if (cpu_has_vmx_vmfunc()) {
>>>  		vmx->nested.nested_vmx_secondary_ctls_high |=
>>>  			SECONDARY_EXEC_ENABLE_VMFUNC;
>>> -		vmx->nested.nested_vmx_vmfunc_controls = 0;
>>> +		/*
>>> +		 * Advertise EPTP switching unconditionally
>>> +		 * since we emulate it
>>> +		 */
>>> +		vmx->nested.nested_vmx_vmfunc_controls =
>>> +			VMX_VMFUNC_EPTP_SWITCHING;
>>
>> Should this only be advertised if enable_ept is set (if the guest also
>> sees/can use SECONDARY_EXEC_ENABLE_EPT)?
> 
> This represents the function control MSR, which on hardware is a
> read-only value. The checks for enable_ept and such are done elsewhere.

This is the

if (!nested_cpu_has_eptp_switching(vmcs12) ||
    !nested_cpu_has_ept(vmcs12))
	return 1;

check then, I assume. Makes sense.

> 
>>>  	}
>>>  
>>>  	/*
>>> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>>>  	return 1;
>>>  }
>>>  
>>> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
>>
>> check_..._valid -> valid_ept_address() ?
> 
> I think either of the names is fine and I would prefer not
> to respin unless you feel really strongly about it :)

Sure, if you have to respin, you can fix this up.

> 
>>
>>> +{
>>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>> +	u64 mask = VMX_EPT_RWX_MASK;
>>> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
>>> +	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
>>> +
>>> +	/* Check for execute_only validity */
>>> +	if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
>>> +		if (!(vmx->nested.nested_vmx_ept_caps &
>>> +		      VMX_EPT_EXECUTE_ONLY_BIT))
>>> +			return false;
>>> +	}
>>> +
>>> +	/* Bits 5:3 must be 3 */
>>> +	if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
>>> +		return false;
>>> +
>>> +	/* Reserved bits should not be set */
>>> +	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
>>> +		return false;
>>> +
>>> +	/* AD, if set, should be supported */
>>> +	if ((address & VMX_EPT_AD_ENABLE_BIT)) {
>>> +		if (!enable_ept_ad_bits)
>>> +			return false;
>>> +		mmu->ept_ad = true;
>>> +	} else
>>> +		mmu->ept_ad = false;
>>
>> I wouldn't expect a "check" function to modify the mmu. Can you move
>> modifying the mmu outside of this function (leaving the
>> enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
>> _after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)
>>
> 
> Well, the correct thing to do is to have a wrapper around it in mmu.c,
> rather than modifying the mmu directly here, and also to call this function
> before nested_mmu is initialized. I am working on a separate patch for this btw.

Sounds good. Also thought that encapsulating this might be a good option.

> It seems to me setting mmu->ept_ad after kvm_mmu_unload is unnecessary
> since it's already being set only if everything else succeeds.
> kvm_mmu_unload() isn't affected by the setting of this flag if I understand
> correctly.

It looks at least cleaner to set everything up after the unload has
happened (and could avoid future bugs, if that unload were ever to rely
on that setting!). Plus, we can reuse that function more easily (e.g. on VM entry).

But that's just my personal opinion. Feel free to ignore.

> 
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
>>> +				     struct vmcs12 *vmcs12)
>>> +{
>>> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>>> +	u64 *l1_eptp_list, address;
>>> +	struct page *page;
>>> +
>>> +	if (!nested_cpu_has_eptp_switching(vmcs12) ||
>>> +	    !nested_cpu_has_ept(vmcs12))
>>> +		return 1;
>>> +
>>> +	if (index >= VMFUNC_EPTP_ENTRIES)
>>> +		return 1;
>>> +
>>> +	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>>> +	if (!page)
>>> +		return 1;
>>> +
>>> +	l1_eptp_list = kmap(page);
>>> +	address = l1_eptp_list[index];
>>> +
>>> +	/*
>>> +	 * If the (L2) guest does a vmfunc to the currently
>>> +	 * active ept pointer, we don't have to do anything else
>>> +	 */
>>> +	if (vmcs12->ept_pointer != address) {
>>> +		if (!check_ept_address_valid(vcpu, address)) {
>>> +			kunmap(page);
>>> +			nested_release_page_clean(page);
>>> +			return 1;
>>> +		}
>>> +		kvm_mmu_unload(vcpu);
>>> +		vmcs12->ept_pointer = address;
>>> +		/*
>>> +		 * TODO: Check what's the correct approach in case
>>> +		 * mmu reload fails. Currently, we just let the next
>>> +		 * reload potentially fail
>>> +		 */
>>> +		kvm_mmu_reload(vcpu);
>>
>> So, what actually happens if this generates a triple fault? I guess we
>> will kill the (nested) hypervisor?
> 
> Yes. Not sure what the right thing to do is, though...

Wonder what happens on real hardware.
Radim Krčmář Aug. 1, 2017, 2:55 p.m. UTC | #4
2017-08-01 13:40+0200, David Hildenbrand:
> On 31.07.2017 21:32, Bandan Das wrote:
> > David Hildenbrand <david@redhat.com> writes:
> >>> +	/* AD, if set, should be supported */
> >>> +	if ((address & VMX_EPT_AD_ENABLE_BIT)) {
> >>> +		if (!enable_ept_ad_bits)
> >>> +			return false;
> >>> +		mmu->ept_ad = true;
> >>> +	} else
> >>> +		mmu->ept_ad = false;

This block should also set the mmu->base_role.ad_disabled.

(The idea being that things should be done as if the EPTP was set during
 a VM entry.  The only notable difference is that we do not reload
 PDPTRS.)
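
Roughly like this (untested sketch, keeping the two in sync):

	/* AD, if set, should be supported */
	if (address & VMX_EPT_AD_ENABLE_BIT) {
		if (!enable_ept_ad_bits)
			return false;
		mmu->ept_ad = true;
	} else {
		mmu->ept_ad = false;
	}
	mmu->base_role.ad_disabled = !(address & VMX_EPT_AD_ENABLE_BIT);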

> >> I wouldn't expect a "check" function to modify the mmu. Can you move
> >> modifying the mmu outside of this function (leaving the
> >> enable_ept_ad_bits check in place)? (and maybe even set mmu->ept_ad
> >> _after_ the kvm_mmu_unload(vcpu)?, just when setting vmcs12->ept_pointer?)
> >>
> > 
> > Well, the correct thing to do is to have a wrapper around it in mmu.c,
> > rather than modifying the mmu directly here, and also to call this function
> > before nested_mmu is initialized. I am working on a separate patch for this btw.
> 
> Sounds good. Also thought that encapsulating this might be a good option.

Seconded. :)

> >>> +	if (vmcs12->ept_pointer != address) {
> >>> +		if (!check_ept_address_valid(vcpu, address)) {
> >>> +			kunmap(page);
> >>> +			nested_release_page_clean(page);
> >>> +			return 1;
> >>> +		}
> >>> +		kvm_mmu_unload(vcpu);
> >>> +		vmcs12->ept_pointer = address;
> >>> +		/*
> >>> +		 * TODO: Check what's the correct approach in case
> >>> +		 * mmu reload fails. Currently, we just let the next
> >>> +		 * reload potentially fail
> >>> +		 */
> >>> +		kvm_mmu_reload(vcpu);
> >>
> >> So, what actually happens if this generates a triple fault? I guess we
> >> will kill the (nested) hypervisor?
> > 
> > Yes. Not sure what the right thing to do is, though...

Right, we could even drop kvm_mmu_reload() here for now and let the one in
vcpu_enter_guest() take care of it.

> Wonder what happens on real hardware.

This operation cannot fail on real hardware.  All addresses within the
physical address limit return something when read.  On Intel, this is a
value with all bits set (-1) and will cause an EPT misconfiguration VM
exit on the next page walk (instruction decoding).

Thanks.
Radim Krčmář Aug. 1, 2017, 3:17 p.m. UTC | #5
2017-07-28 15:52-0400, Bandan Das:
> When L2 uses vmfunc, L0 utilizes the associated vmexit to
> emulate a switching of the ept pointer by reloading the
> guest MMU.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> ---
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u64 mask = VMX_EPT_RWX_MASK;
> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
> +
> +	/* Check for execute_only validity */
> +	if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
> +		if (!(vmx->nested.nested_vmx_ept_caps &
> +		      VMX_EPT_EXECUTE_ONLY_BIT))
> +			return false;
> +	}

This check looks wrong ... bits 0:2 define the memory type:

  0 = Uncacheable (UC)
  6 = Write-back (WB)

Whether those are supported is reported in MSR IA32_VMX_EPT_VPID_CAP, so I
think it should return false when

  (address & 0x7) == 0 && !(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))

the same for 6 and VMX_EPTP_WB_BIT and unconditionally for the remaining
types.
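
I.e. something along these lines (rough, untested sketch):

	/* Memory type (bits 2:0) must be a supported one */
	switch (address & 0x7) {
	case 0:		/* uncacheable (UC) */
		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
			return false;
		break;
	case 6:		/* write-back (WB) */
		if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
			return false;
		break;
	default:	/* all other encodings are reserved */
		return false;
	}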

Btw. when is TLB flushed after EPTP switching?

> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  				vmx->nested.nested_vmx_entry_ctls_high))
>  		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>  
> -	if (nested_cpu_has_vmfunc(vmcs12) &&
> -	    (vmcs12->vm_function_control &
> -	     ~vmx->nested.nested_vmx_vmfunc_controls))
> -		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +	if (nested_cpu_has_vmfunc(vmcs12)) {
> +		if (vmcs12->vm_function_control &
> +		    ~vmx->nested.nested_vmx_vmfunc_controls)
> +			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
> +
> +		if (nested_cpu_has_eptp_switching(vmcs12)) {
> +			if (!nested_cpu_has_ept(vmcs12) ||
> +			    (vmcs12->eptp_list_address >>
> +			     cpuid_maxphyaddr(vcpu)) ||
> +			    !IS_ALIGNED(vmcs12->eptp_list_address, 4096))

page_address_valid() would make this check a bit nicer,

thanks.

> +				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
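
I.e. roughly (untested; assuming the existing page_address_valid(vcpu, gpa)
helper in vmx.c):

	if (nested_cpu_has_eptp_switching(vmcs12)) {
		if (!nested_cpu_has_ept(vmcs12) ||
		    !page_address_valid(vcpu, vmcs12->eptp_list_address))
			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
	}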
Bandan Das Aug. 1, 2017, 6:30 p.m. UTC | #6
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-28 15:52-0400, Bandan Das:
>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> emulate a switching of the ept pointer by reloading the
>> guest MMU.
>> 
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> ---
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> @@ -7767,6 +7781,85 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>>  	return 1;
>>  }
>>  
>> +static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	u64 mask = VMX_EPT_RWX_MASK;
>> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
>> +	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
>> +
>> +	/* Check for execute_only validity */
>> +	if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
>> +		if (!(vmx->nested.nested_vmx_ept_caps &
>> +		      VMX_EPT_EXECUTE_ONLY_BIT))
>> +			return false;
>> +	}
>
> This check looks wrong ... bits 0:2 define the memory type:
>
>   0 = Uncacheable (UC)
>   6 = Write-back (WB)

Oops, sorry, I badly messed this up! I will incorporate these
changes and the suggestions by David into a new version.

> Whether those are supported is reported in MSR IA32_VMX_EPT_VPID_CAP, so I
> think it should return false when
>
>   (address & 0x7) == 0 && !(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
>
> the same for 6 and VMX_EPTP_WB_BIT and unconditionally for the remaining
> types.
>
> Btw. when is TLB flushed after EPTP switching?

From what I understand, mmu_sync_roots() calls kvm_mmu_flush_or_zap()
that sets KVM_REQ_TLB_FLUSH.

Bandan

>> @@ -10354,10 +10456,20 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  				vmx->nested.nested_vmx_entry_ctls_high))
>>  		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>>  
>> -	if (nested_cpu_has_vmfunc(vmcs12) &&
>> -	    (vmcs12->vm_function_control &
>> -	     ~vmx->nested.nested_vmx_vmfunc_controls))
>> -		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +	if (nested_cpu_has_vmfunc(vmcs12)) {
>> +		if (vmcs12->vm_function_control &
>> +		    ~vmx->nested.nested_vmx_vmfunc_controls)
>> +			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>> +
>> +		if (nested_cpu_has_eptp_switching(vmcs12)) {
>> +			if (!nested_cpu_has_ept(vmcs12) ||
>> +			    (vmcs12->eptp_list_address >>
>> +			     cpuid_maxphyaddr(vcpu)) ||
>> +			    !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
>
> page_address_valid() would make this check a bit nicer,
>
> thanks.
>
>> +				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da5375e..5f63a2e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -115,6 +115,10 @@ 
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
+#define VMFUNC_EPTP_ENTRIES  512
+
 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
 {
 	return vmx_basic & GENMASK_ULL(30, 0);
@@ -200,6 +204,8 @@  enum vmcs_field {
 	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
 	EOI_EXIT_BITMAP3                = 0x00002022,
 	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+	EPTP_LIST_ADDRESS               = 0x00002024,
+	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
 	VMREAD_BITMAP                   = 0x00002026,
 	VMWRITE_BITMAP                  = 0x00002028,
 	XSS_EXIT_BITMAP                 = 0x0000202C,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe8f5fc..f1ab783 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -246,6 +246,7 @@  struct __packed vmcs12 {
 	u64 eoi_exit_bitmap1;
 	u64 eoi_exit_bitmap2;
 	u64 eoi_exit_bitmap3;
+	u64 eptp_list_address;
 	u64 xss_exit_bitmap;
 	u64 guest_physical_address;
 	u64 vmcs_link_pointer;
@@ -771,6 +772,7 @@  static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
 	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
 	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1402,6 +1404,13 @@  static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
 }
 
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has_vmfunc(vmcs12) &&
+		(vmcs12->vm_function_control &
+		 VMX_VMFUNC_EPTP_SWITCHING);
+}
+
 static inline bool is_nmi(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2791,7 +2800,12 @@  static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	if (cpu_has_vmx_vmfunc()) {
 		vmx->nested.nested_vmx_secondary_ctls_high |=
 			SECONDARY_EXEC_ENABLE_VMFUNC;
-		vmx->nested.nested_vmx_vmfunc_controls = 0;
+		/*
+		 * Advertise EPTP switching unconditionally
+		 * since we emulate it
+		 */
+		vmx->nested.nested_vmx_vmfunc_controls =
+			VMX_VMFUNC_EPTP_SWITCHING;
 	}
 
 	/*
@@ -7767,6 +7781,85 @@  static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static bool check_ept_address_valid(struct kvm_vcpu *vcpu, u64 address)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 mask = VMX_EPT_RWX_MASK;
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+	/* Check for execute_only validity */
+	if ((address & mask) == VMX_EPT_EXECUTABLE_MASK) {
+		if (!(vmx->nested.nested_vmx_ept_caps &
+		      VMX_EPT_EXECUTE_ONLY_BIT))
+			return false;
+	}
+
+	/* Bits 5:3 must be 3 */
+	if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
+		return false;
+
+	/* Reserved bits should not be set */
+	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+		return false;
+
+	/* AD, if set, should be supported */
+	if ((address & VMX_EPT_AD_ENABLE_BIT)) {
+		if (!enable_ept_ad_bits)
+			return false;
+		mmu->ept_ad = true;
+	} else
+		mmu->ept_ad = false;
+
+	return true;
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+				     struct vmcs12 *vmcs12)
+{
+	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+	u64 *l1_eptp_list, address;
+	struct page *page;
+
+	if (!nested_cpu_has_eptp_switching(vmcs12) ||
+	    !nested_cpu_has_ept(vmcs12))
+		return 1;
+
+	if (index >= VMFUNC_EPTP_ENTRIES)
+		return 1;
+
+	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
+	if (!page)
+		return 1;
+
+	l1_eptp_list = kmap(page);
+	address = l1_eptp_list[index];
+
+	/*
+	 * If the (L2) guest does a vmfunc to the currently
+	 * active ept pointer, we don't have to do anything else
+	 */
+	if (vmcs12->ept_pointer != address) {
+		if (!check_ept_address_valid(vcpu, address)) {
+			kunmap(page);
+			nested_release_page_clean(page);
+			return 1;
+		}
+		kvm_mmu_unload(vcpu);
+		vmcs12->ept_pointer = address;
+		/*
+		 * TODO: Check what's the correct approach in case
+		 * mmu reload fails. Currently, we just let the next
+		 * reload potentially fail
+		 */
+		kvm_mmu_reload(vcpu);
+	}
+
+	kunmap(page);
+	nested_release_page_clean(page);
+	return 0;
+}
+
 static int handle_vmfunc(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7786,7 +7879,16 @@  static int handle_vmfunc(struct kvm_vcpu *vcpu)
 	vmcs12 = get_vmcs12(vcpu);
 	if ((vmcs12->vm_function_control & (1 << function)) == 0)
 		goto fail;
-	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
+
+	switch (function) {
+	case 0:
+		if (nested_vmx_eptp_switching(vcpu, vmcs12))
+			goto fail;
+		break;
+	default:
+		goto fail;
+	}
+	return kvm_skip_emulated_instruction(vcpu);
 
 fail:
 	nested_vmx_vmexit(vcpu, vmx->exit_reason,
@@ -10354,10 +10456,20 @@  static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				vmx->nested.nested_vmx_entry_ctls_high))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-	if (nested_cpu_has_vmfunc(vmcs12) &&
-	    (vmcs12->vm_function_control &
-	     ~vmx->nested.nested_vmx_vmfunc_controls))
-		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+	if (nested_cpu_has_vmfunc(vmcs12)) {
+		if (vmcs12->vm_function_control &
+		    ~vmx->nested.nested_vmx_vmfunc_controls)
+			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+		if (nested_cpu_has_eptp_switching(vmcs12)) {
+			if (!nested_cpu_has_ept(vmcs12) ||
+			    (vmcs12->eptp_list_address >>
+			     cpuid_maxphyaddr(vcpu)) ||
+			    !IS_ALIGNED(vmcs12->eptp_list_address, 4096))
+				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+		}
+	}
+
 
 	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;