[RFC] KVM: VMX: drop vmm_exclusive module parameter
diff mbox

Message ID 20170310114713.7571-1-david@redhat.com
State New
Headers show

Commit Message

David Hildenbrand March 10, 2017, 11:47 a.m. UTC
vmm_exclusive=0 leads to KVM setting X86_CR4_VMXE always and calling
VMXON only when the vcpu is loaded. X86_CR4_VMXE is used as an
indication in cpu_emergency_vmxoff() (called on kdump) if VMXOFF has to be
called. This is obviously not the case if both are used independtly.
Calling VMXOFF without a previous VMXON will result in an exception.

In addition, X86_CR4_VMXE is used as a mean to test if VMX is already in
use by another VMM in hardware_enable(). So there can't really be
co-existance. If the other VMM is prepared for co-existance and does a
similar check, only one VMM can exist. If the other VMM is not prepared
and blindly sets/clears X86_CR4_VMXE, we will get inconsistencies with
X86_CR4_VMXE.

As we also had bug reports related to clearing of vmcs with vmm_exclusive=0
this seems to be pretty much untested. So let's better drop it.

While at it, directly move setting/clearing X86_CR4_VMXE into
kvm_cpu_vmxon/off.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/x86/kvm/vmx.c | 38 +++++++-------------------------------
 1 file changed, 7 insertions(+), 31 deletions(-)

Comments

Radim Krčmář March 14, 2017, 8:30 p.m. UTC | #1
2017-03-10 12:47+0100, David Hildenbrand:
> vmm_exclusive=0 leads to KVM setting X86_CR4_VMXE always and calling
> VMXON only when the vcpu is loaded. X86_CR4_VMXE is used as an
> indication in cpu_emergency_vmxoff() (called on kdump) if VMXOFF has to be
> called. This is obviously not the case if both are used independtly.
> Calling VMXOFF without a previous VMXON will result in an exception.
> 
> In addition, X86_CR4_VMXE is used as a mean to test if VMX is already in
> use by another VMM in hardware_enable(). So there can't really be
> co-existance. If the other VMM is prepared for co-existance and does a
> similar check, only one VMM can exist. If the other VMM is not prepared
> and blindly sets/clears X86_CR4_VMXE, we will get inconsistencies with
> X86_CR4_VMXE.
> 
> As we also had bug reports related to clearing of vmcs with vmm_exclusive=0
> this seems to be pretty much untested. So let's better drop it.

Totally,

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

> While at it, directly move setting/clearing X86_CR4_VMXE into
> kvm_cpu_vmxon/off.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 38 +++++++-------------------------------
>  1 file changed, 7 insertions(+), 31 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 283aa86..bbbfe12 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
>  static bool __read_mostly emulate_invalid_guest_state = true;
>  module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>  
> -static bool __read_mostly vmm_exclusive = 1;
> -module_param(vmm_exclusive, bool, S_IRUGO);
> -
>  static bool __read_mostly fasteoi = 1;
>  module_param(fasteoi, bool, S_IRUGO);
>  
> @@ -914,8 +911,6 @@ static void nested_release_page_clean(struct page *page)
>  
>  static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
>  static u64 construct_eptp(unsigned long root_hpa);
> -static void kvm_cpu_vmxon(u64 addr);
> -static void kvm_cpu_vmxoff(void);
>  static bool vmx_xsaves_supported(void);
>  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
>  static void vmx_set_segment(struct kvm_vcpu *vcpu,
> @@ -2235,15 +2230,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
>  static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
>  	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
>  
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxon(phys_addr);
> -	else if (!already_loaded)
> -		loaded_vmcs_clear(vmx->loaded_vmcs);
> -
>  	if (!already_loaded) {
> +		loaded_vmcs_clear(vmx->loaded_vmcs);
>  		local_irq_disable();
>  		crash_disable_local_vmclear(cpu);
>  
> @@ -2321,11 +2311,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
>  	vmx_vcpu_pi_put(vcpu);
>  
>  	__vmx_load_host_state(to_vmx(vcpu));
> -	if (!vmm_exclusive) {
> -		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
> -		vcpu->cpu = -1;
> -		kvm_cpu_vmxoff();
> -	}
>  }
>  
>  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
> @@ -3416,6 +3401,7 @@ static __init int vmx_disabled_by_bios(void)
>  
>  static void kvm_cpu_vmxon(u64 addr)
>  {
> +	cr4_set_bits(X86_CR4_VMXE);
>  	intel_pt_handle_vmx(1);
>  
>  	asm volatile (ASM_VMX_VMXON_RAX
> @@ -3458,12 +3444,8 @@ static int hardware_enable(void)
>  		/* enable and lock */
>  		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
>  	}
> -	cr4_set_bits(X86_CR4_VMXE);
> -
> -	if (vmm_exclusive) {
> -		kvm_cpu_vmxon(phys_addr);
> -		ept_sync_global();
> -	}
> +	kvm_cpu_vmxon(phys_addr);
> +	ept_sync_global();
>  
>  	native_store_gdt(this_cpu_ptr(&host_gdt));
>  
> @@ -3489,15 +3471,13 @@ static void kvm_cpu_vmxoff(void)
>  	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
>  
>  	intel_pt_handle_vmx(0);
> +	cr4_clear_bits(X86_CR4_VMXE);
>  }
>  
>  static void hardware_disable(void)
>  {
> -	if (vmm_exclusive) {
> -		vmclear_local_loaded_vmcss();
> -		kvm_cpu_vmxoff();
> -	}
> -	cr4_clear_bits(X86_CR4_VMXE);
> +	vmclear_local_loaded_vmcss();
> +	kvm_cpu_vmxoff();
>  }
>  
>  static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
> @@ -9228,11 +9208,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>  	vmx->loaded_vmcs->shadow_vmcs = NULL;
>  	if (!vmx->loaded_vmcs->vmcs)
>  		goto free_msrs;
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
>  	loaded_vmcs_init(vmx->loaded_vmcs);
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxoff();
>  
>  	cpu = get_cpu();
>  	vmx_vcpu_load(&vmx->vcpu, cpu);
> -- 
> 2.9.3
>
Paolo Bonzini April 18, 2017, 1:30 p.m. UTC | #2
On 14/03/2017 21:30, Radim Krčmář wrote:
> 2017-03-10 12:47+0100, David Hildenbrand:
>> vmm_exclusive=0 leads to KVM setting X86_CR4_VMXE always and calling
>> VMXON only when the vcpu is loaded. X86_CR4_VMXE is used as an
>> indication in cpu_emergency_vmxoff() (called on kdump) if VMXOFF has to be
>> called. This is obviously not the case if both are used independtly.
>> Calling VMXOFF without a previous VMXON will result in an exception.
>>
>> In addition, X86_CR4_VMXE is used as a mean to test if VMX is already in
>> use by another VMM in hardware_enable(). So there can't really be
>> co-existance. If the other VMM is prepared for co-existance and does a
>> similar check, only one VMM can exist. If the other VMM is not prepared
>> and blindly sets/clears X86_CR4_VMXE, we will get inconsistencies with
>> X86_CR4_VMXE.
>>
>> As we also had bug reports related to clearing of vmcs with vmm_exclusive=0
>> this seems to be pretty much untested. So let's better drop it.
> 
> Totally,
> 
> Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

Radim,

are you putting this in kvm/queue for 4.12?

Thanks,

Paolo

>> While at it, directly move setting/clearing X86_CR4_VMXE into
>> kvm_cpu_vmxon/off.
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>  arch/x86/kvm/vmx.c | 38 +++++++-------------------------------
>>  1 file changed, 7 insertions(+), 31 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 283aa86..bbbfe12 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
>>  static bool __read_mostly emulate_invalid_guest_state = true;
>>  module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>>  
>> -static bool __read_mostly vmm_exclusive = 1;
>> -module_param(vmm_exclusive, bool, S_IRUGO);
>> -
>>  static bool __read_mostly fasteoi = 1;
>>  module_param(fasteoi, bool, S_IRUGO);
>>  
>> @@ -914,8 +911,6 @@ static void nested_release_page_clean(struct page *page)
>>  
>>  static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
>>  static u64 construct_eptp(unsigned long root_hpa);
>> -static void kvm_cpu_vmxon(u64 addr);
>> -static void kvm_cpu_vmxoff(void);
>>  static bool vmx_xsaves_supported(void);
>>  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
>>  static void vmx_set_segment(struct kvm_vcpu *vcpu,
>> @@ -2235,15 +2230,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
>>  static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>  {
>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> -	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
>>  	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
>>  
>> -	if (!vmm_exclusive)
>> -		kvm_cpu_vmxon(phys_addr);
>> -	else if (!already_loaded)
>> -		loaded_vmcs_clear(vmx->loaded_vmcs);
>> -
>>  	if (!already_loaded) {
>> +		loaded_vmcs_clear(vmx->loaded_vmcs);
>>  		local_irq_disable();
>>  		crash_disable_local_vmclear(cpu);
>>  
>> @@ -2321,11 +2311,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
>>  	vmx_vcpu_pi_put(vcpu);
>>  
>>  	__vmx_load_host_state(to_vmx(vcpu));
>> -	if (!vmm_exclusive) {
>> -		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
>> -		vcpu->cpu = -1;
>> -		kvm_cpu_vmxoff();
>> -	}
>>  }
>>  
>>  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
>> @@ -3416,6 +3401,7 @@ static __init int vmx_disabled_by_bios(void)
>>  
>>  static void kvm_cpu_vmxon(u64 addr)
>>  {
>> +	cr4_set_bits(X86_CR4_VMXE);
>>  	intel_pt_handle_vmx(1);
>>  
>>  	asm volatile (ASM_VMX_VMXON_RAX
>> @@ -3458,12 +3444,8 @@ static int hardware_enable(void)
>>  		/* enable and lock */
>>  		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
>>  	}
>> -	cr4_set_bits(X86_CR4_VMXE);
>> -
>> -	if (vmm_exclusive) {
>> -		kvm_cpu_vmxon(phys_addr);
>> -		ept_sync_global();
>> -	}
>> +	kvm_cpu_vmxon(phys_addr);
>> +	ept_sync_global();
>>  
>>  	native_store_gdt(this_cpu_ptr(&host_gdt));
>>  
>> @@ -3489,15 +3471,13 @@ static void kvm_cpu_vmxoff(void)
>>  	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
>>  
>>  	intel_pt_handle_vmx(0);
>> +	cr4_clear_bits(X86_CR4_VMXE);
>>  }
>>  
>>  static void hardware_disable(void)
>>  {
>> -	if (vmm_exclusive) {
>> -		vmclear_local_loaded_vmcss();
>> -		kvm_cpu_vmxoff();
>> -	}
>> -	cr4_clear_bits(X86_CR4_VMXE);
>> +	vmclear_local_loaded_vmcss();
>> +	kvm_cpu_vmxoff();
>>  }
>>  
>>  static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
>> @@ -9228,11 +9208,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>>  	vmx->loaded_vmcs->shadow_vmcs = NULL;
>>  	if (!vmx->loaded_vmcs->vmcs)
>>  		goto free_msrs;
>> -	if (!vmm_exclusive)
>> -		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
>>  	loaded_vmcs_init(vmx->loaded_vmcs);
>> -	if (!vmm_exclusive)
>> -		kvm_cpu_vmxoff();
>>  
>>  	cpu = get_cpu();
>>  	vmx_vcpu_load(&vmx->vcpu, cpu);
>> -- 
>> 2.9.3
>>
Paolo Bonzini April 21, 2017, 9:42 a.m. UTC | #3
On 10/03/2017 12:47, David Hildenbrand wrote:
> vmm_exclusive=0 leads to KVM setting X86_CR4_VMXE always and calling
> VMXON only when the vcpu is loaded. X86_CR4_VMXE is used as an
> indication in cpu_emergency_vmxoff() (called on kdump) if VMXOFF has to be
> called. This is obviously not the case if both are used independtly.
> Calling VMXOFF without a previous VMXON will result in an exception.
> 
> In addition, X86_CR4_VMXE is used as a mean to test if VMX is already in
> use by another VMM in hardware_enable(). So there can't really be
> co-existance. If the other VMM is prepared for co-existance and does a
> similar check, only one VMM can exist. If the other VMM is not prepared
> and blindly sets/clears X86_CR4_VMXE, we will get inconsistencies with
> X86_CR4_VMXE.
> 
> As we also had bug reports related to clearing of vmcs with vmm_exclusive=0
> this seems to be pretty much untested. So let's better drop it.
> 
> While at it, directly move setting/clearing X86_CR4_VMXE into
> kvm_cpu_vmxon/off.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 38 +++++++-------------------------------
>  1 file changed, 7 insertions(+), 31 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 283aa86..bbbfe12 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
>  static bool __read_mostly emulate_invalid_guest_state = true;
>  module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>  
> -static bool __read_mostly vmm_exclusive = 1;
> -module_param(vmm_exclusive, bool, S_IRUGO);
> -
>  static bool __read_mostly fasteoi = 1;
>  module_param(fasteoi, bool, S_IRUGO);
>  
> @@ -914,8 +911,6 @@ static void nested_release_page_clean(struct page *page)
>  
>  static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
>  static u64 construct_eptp(unsigned long root_hpa);
> -static void kvm_cpu_vmxon(u64 addr);
> -static void kvm_cpu_vmxoff(void);
>  static bool vmx_xsaves_supported(void);
>  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
>  static void vmx_set_segment(struct kvm_vcpu *vcpu,
> @@ -2235,15 +2230,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
>  static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
>  	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
>  
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxon(phys_addr);
> -	else if (!already_loaded)
> -		loaded_vmcs_clear(vmx->loaded_vmcs);
> -
>  	if (!already_loaded) {
> +		loaded_vmcs_clear(vmx->loaded_vmcs);
>  		local_irq_disable();
>  		crash_disable_local_vmclear(cpu);
>  
> @@ -2321,11 +2311,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
>  	vmx_vcpu_pi_put(vcpu);
>  
>  	__vmx_load_host_state(to_vmx(vcpu));
> -	if (!vmm_exclusive) {
> -		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
> -		vcpu->cpu = -1;
> -		kvm_cpu_vmxoff();
> -	}
>  }
>  
>  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
> @@ -3416,6 +3401,7 @@ static __init int vmx_disabled_by_bios(void)
>  
>  static void kvm_cpu_vmxon(u64 addr)
>  {
> +	cr4_set_bits(X86_CR4_VMXE);
>  	intel_pt_handle_vmx(1);
>  
>  	asm volatile (ASM_VMX_VMXON_RAX
> @@ -3458,12 +3444,8 @@ static int hardware_enable(void)
>  		/* enable and lock */
>  		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
>  	}
> -	cr4_set_bits(X86_CR4_VMXE);
> -
> -	if (vmm_exclusive) {
> -		kvm_cpu_vmxon(phys_addr);
> -		ept_sync_global();
> -	}
> +	kvm_cpu_vmxon(phys_addr);
> +	ept_sync_global();
>  
>  	native_store_gdt(this_cpu_ptr(&host_gdt));
>  
> @@ -3489,15 +3471,13 @@ static void kvm_cpu_vmxoff(void)
>  	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
>  
>  	intel_pt_handle_vmx(0);
> +	cr4_clear_bits(X86_CR4_VMXE);
>  }
>  
>  static void hardware_disable(void)
>  {
> -	if (vmm_exclusive) {
> -		vmclear_local_loaded_vmcss();
> -		kvm_cpu_vmxoff();
> -	}
> -	cr4_clear_bits(X86_CR4_VMXE);
> +	vmclear_local_loaded_vmcss();
> +	kvm_cpu_vmxoff();
>  }
>  
>  static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
> @@ -9228,11 +9208,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>  	vmx->loaded_vmcs->shadow_vmcs = NULL;
>  	if (!vmx->loaded_vmcs->vmcs)
>  		goto free_msrs;
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
>  	loaded_vmcs_init(vmx->loaded_vmcs);
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxoff();
>  
>  	cpu = get_cpu();
>  	vmx_vcpu_load(&vmx->vcpu, cpu);
> 

Applied, thanks.

Paolo
Arnaldo Carvalho de Melo June 21, 2017, 5:48 p.m. UTC | #4
Em Fri, Mar 10, 2017 at 12:47:13PM +0100, David Hildenbrand escreveu:
> vmm_exclusive=0 leads to KVM setting X86_CR4_VMXE always and calling
> VMXON only when the vcpu is loaded. X86_CR4_VMXE is used as an
> indication in cpu_emergency_vmxoff() (called on kdump) if VMXOFF has to be
> called. This is obviously not the case if both are used independtly.
> Calling VMXOFF without a previous VMXON will result in an exception.
> 
> In addition, X86_CR4_VMXE is used as a mean to test if VMX is already in
> use by another VMM in hardware_enable(). So there can't really be
> co-existance. If the other VMM is prepared for co-existance and does a
> similar check, only one VMM can exist. If the other VMM is not prepared
> and blindly sets/clears X86_CR4_VMXE, we will get inconsistencies with
> X86_CR4_VMXE.
> 
> As we also had bug reports related to clearing of vmcs with vmm_exclusive=0
> this seems to be pretty much untested. So let's better drop it.
> 
> While at it, directly move setting/clearing X86_CR4_VMXE into
> kvm_cpu_vmxon/off.

Oh well, I was using, as suggested by Alexander, this parameter to be
able to use Intel PT on the host on a Broadwell machine, i.e.:

  perf record -e intel_pt// usleep 1
  perf script

would show decoded Intel PT records, no more :-\ But I'm clueless about
KVM internals, so just reporting the change in behaviour for this very
specific use case.

Now I don't know if this is something that would make Intel PT be usable
on Broadwell machines but wouldn't be required with newer chips, will
test with a Kaby Lake i5 7500 when back at my home office...

- Arnaldo
Radim Krčmář June 21, 2017, 6:24 p.m. UTC | #5
2017-06-21 14:48-0300, Arnaldo Carvalho de Melo:
> Em Fri, Mar 10, 2017 at 12:47:13PM +0100, David Hildenbrand escreveu:
> > vmm_exclusive=0 leads to KVM setting X86_CR4_VMXE always and calling
> > VMXON only when the vcpu is loaded. X86_CR4_VMXE is used as an
> > indication in cpu_emergency_vmxoff() (called on kdump) if VMXOFF has to be
> > called. This is obviously not the case if both are used independtly.
> > Calling VMXOFF without a previous VMXON will result in an exception.
> > 
> > In addition, X86_CR4_VMXE is used as a mean to test if VMX is already in
> > use by another VMM in hardware_enable(). So there can't really be
> > co-existance. If the other VMM is prepared for co-existance and does a
> > similar check, only one VMM can exist. If the other VMM is not prepared
> > and blindly sets/clears X86_CR4_VMXE, we will get inconsistencies with
> > X86_CR4_VMXE.
> > 
> > As we also had bug reports related to clearing of vmcs with vmm_exclusive=0
> > this seems to be pretty much untested. So let's better drop it.
> > 
> > While at it, directly move setting/clearing X86_CR4_VMXE into
> > kvm_cpu_vmxon/off.
> 
> Oh well, I was using, as suggested by Alexander, this parameter to be
> able to use Intel PT on the host on a Broadwell machine, i.e.:
> 
>   perf record -e intel_pt// usleep 1
>   perf script

We thought that blacklisting the KVM module was a good solution ...
Were you using KVM virtual machines with vmm_exclusive=0?

> would show decoded Intel PT records, no more :-\ But I'm clueless about
> KVM internals, so just reporting the change in behaviour for this very
> specific use case.
> 
> Now I don't know if this is something that would make Intel PT be usable
> on Broadwell machines but wouldn't be required with newer chips, will
> test with a Kaby Lake i5 7500 when back at my home office...

Most likely, SDM 35.2.8.2 says:

 Initial implementations of Intel Processor Trace do not support tracing
 in VMX operation. Such processors indicate this by returning 0 for
 IA32_VMX_MISC[bit 14].

so something akin to vmm_exclusive is about the only option there.

Please try if Kaby Lake is already an advanced implementation, because
we might need to disable PT when entering VMX non-root mode
(so the tracing packets are not be written into guest's memory, just
 like with PEBS).

Thanks.

Patch
diff mbox

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 283aa86..bbbfe12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,9 +84,6 @@  module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
 static bool __read_mostly emulate_invalid_guest_state = true;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
-static bool __read_mostly vmm_exclusive = 1;
-module_param(vmm_exclusive, bool, S_IRUGO);
-
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
@@ -914,8 +911,6 @@  static void nested_release_page_clean(struct page *page)
 
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
-static void kvm_cpu_vmxon(u64 addr);
-static void kvm_cpu_vmxoff(void);
 static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -2235,15 +2230,10 @@  static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
 
-	if (!vmm_exclusive)
-		kvm_cpu_vmxon(phys_addr);
-	else if (!already_loaded)
-		loaded_vmcs_clear(vmx->loaded_vmcs);
-
 	if (!already_loaded) {
+		loaded_vmcs_clear(vmx->loaded_vmcs);
 		local_irq_disable();
 		crash_disable_local_vmclear(cpu);
 
@@ -2321,11 +2311,6 @@  static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 	vmx_vcpu_pi_put(vcpu);
 
 	__vmx_load_host_state(to_vmx(vcpu));
-	if (!vmm_exclusive) {
-		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
-		vcpu->cpu = -1;
-		kvm_cpu_vmxoff();
-	}
 }
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
@@ -3416,6 +3401,7 @@  static __init int vmx_disabled_by_bios(void)
 
 static void kvm_cpu_vmxon(u64 addr)
 {
+	cr4_set_bits(X86_CR4_VMXE);
 	intel_pt_handle_vmx(1);
 
 	asm volatile (ASM_VMX_VMXON_RAX
@@ -3458,12 +3444,8 @@  static int hardware_enable(void)
 		/* enable and lock */
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 	}
-	cr4_set_bits(X86_CR4_VMXE);
-
-	if (vmm_exclusive) {
-		kvm_cpu_vmxon(phys_addr);
-		ept_sync_global();
-	}
+	kvm_cpu_vmxon(phys_addr);
+	ept_sync_global();
 
 	native_store_gdt(this_cpu_ptr(&host_gdt));
 
@@ -3489,15 +3471,13 @@  static void kvm_cpu_vmxoff(void)
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 
 	intel_pt_handle_vmx(0);
+	cr4_clear_bits(X86_CR4_VMXE);
 }
 
 static void hardware_disable(void)
 {
-	if (vmm_exclusive) {
-		vmclear_local_loaded_vmcss();
-		kvm_cpu_vmxoff();
-	}
-	cr4_clear_bits(X86_CR4_VMXE);
+	vmclear_local_loaded_vmcss();
+	kvm_cpu_vmxoff();
 }
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -9228,11 +9208,7 @@  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	vmx->loaded_vmcs->shadow_vmcs = NULL;
 	if (!vmx->loaded_vmcs->vmcs)
 		goto free_msrs;
-	if (!vmm_exclusive)
-		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
 	loaded_vmcs_init(vmx->loaded_vmcs);
-	if (!vmm_exclusive)
-		kvm_cpu_vmxoff();
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);