diff mbox

[v2,3/3] KVM: VMX: enable guest access to LMCE related MSRs

Message ID 20160616060531.30028-4-haozhong.zhang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Haozhong Zhang June 16, 2016, 6:05 a.m. UTC
From: Ashok Raj <ashok.raj@intel.com>

On Intel platforms, this patch adds LMCE to KVM MCE supported
capabilities and handles guest access to LMCE related MSRs.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
[Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
           Only enable LMCE on Intel platform
	   Check MSR_IA32_FEATURE_CONTROL when handling guest
	     access to MSR_IA32_MCG_EXT_CTL]
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++++
 arch/x86/kvm/vmx.c              | 36 +++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              | 15 +++++++++------
 3 files changed, 49 insertions(+), 7 deletions(-)

Comments

Paolo Bonzini June 16, 2016, 10:04 a.m. UTC | #1
On 16/06/2016 08:05, Haozhong Zhang wrote:
> From: Ashok Raj <ashok.raj@intel.com>
> 
> On Intel platforms, this patch adds LMCE to KVM MCE supported
> capabilities and handles guest access to LMCE related MSRs.
> 
> Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
>            Only enable LMCE on Intel platform
> 	   Check MSR_IA32_FEATURE_CONTROL when handling guest
> 	     access to MSR_IA32_MCG_EXT_CTL]
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  5 +++++
>  arch/x86/kvm/vmx.c              | 36 +++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              | 15 +++++++++------
>  3 files changed, 49 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e0fbe7e..75defa6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -598,6 +598,7 @@ struct kvm_vcpu_arch {
>  	u64 mcg_cap;
>  	u64 mcg_status;
>  	u64 mcg_ctl;
> +	u64 mcg_ext_ctl;
>  	u64 *mce_banks;
>  
>  	/* Cache MMIO info */
> @@ -1005,6 +1006,8 @@ struct kvm_x86_ops {
>  	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
>  			      uint32_t guest_irq, bool set);
>  	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
> +
> +	void (*setup_mce)(struct kvm_vcpu *vcpu);
>  };
>  
>  struct kvm_arch_async_pf {
> @@ -1077,6 +1080,8 @@ extern u8   kvm_tsc_scaling_ratio_frac_bits;
>  /* maximum allowed value of TSC scaling ratio */
>  extern u64  kvm_max_tsc_scaling_ratio;
>  
> +extern u64 kvm_mce_cap_supported;
> +
>  enum emulation_result {
>  	EMULATE_DONE,         /* no further processing */
>  	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 1dc89c5..42db42e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -638,7 +638,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
>   * feature_control_valid_bits_add/del(), so it's not included here.
>   */
>  #define FEATURE_CONTROL_MAX_VALID_BITS \
> -	FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX
> +	(FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX | FEATURE_CONTROL_LMCE)
>  
>  static void feature_control_valid_bits_add(struct kvm_vcpu *vcpu, uint64_t bits)
>  {
> @@ -2905,6 +2905,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
>  	return valid_bits && !(val & ~valid_bits);
>  }
>  
> +static inline bool vmx_mcg_ext_ctrl_msr_present(struct kvm_vcpu *vcpu,
> +						bool host_initiated)
> +{
> +	return (vcpu->arch.mcg_cap & MCG_LMCE_P) &&

Checking MCG_LMCE_P is unnecessary, because you cannot set
FEATURE_CONTROL_LMCE unless MCG_LMCE_P is present.

You can just inline this function in the callers, it's simpler.

> +		(host_initiated ||
> +		 (to_vmx(vcpu)->msr_ia32_feature_control &
> +		  FEATURE_CONTROL_LMCE));
> +}
> +
>  /*
>   * Reads an msr value (of 'msr_index') into 'pdata'.
>   * Returns 0 on success, non-0 otherwise.
> @@ -2946,6 +2955,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  			return 1;
>  		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
>  		break;
> +	case MSR_IA32_MCG_EXT_CTL:
> +		if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
> +						  msr_info->host_initiated))
> +			return 1;
> +		msr_info->data = vcpu->arch.mcg_ext_ctl;
> +		break;
>  	case MSR_IA32_FEATURE_CONTROL:
>  		if (!vmx_feature_control_msr_valid(vcpu, 0))
>  			return 1;
> @@ -3039,6 +3054,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	case MSR_IA32_TSC_ADJUST:
>  		ret = kvm_set_msr_common(vcpu, msr_info);
>  		break;
> +	case MSR_IA32_MCG_EXT_CTL:
> +		if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
> +						  msr_info->host_initiated) ||
> +		    (data & ~MCG_EXT_CTL_LMCE_EN))
> +			return 1;
> +		vcpu->arch.mcg_ext_ctl = data;
> +		break;
>  	case MSR_IA32_FEATURE_CONTROL:
>  		if (!vmx_feature_control_msr_valid(vcpu, data) ||
>  		    (to_vmx(vcpu)->msr_ia32_feature_control &
> @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void)
>  
>  	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
>  
> +	kvm_mce_cap_supported |= MCG_LMCE_P;

Ah, so virtual LMCE is available on all processors!  This is
interesting, but it also makes it more complicated to handle in QEMU; a
new QEMU generally doesn't require a new kernel.

Eduardo, any ideas?

Thanks,

Paolo

>  	return alloc_kvm_area();
>  
>  out8:
> @@ -10950,6 +10974,14 @@ out:
>  	return ret;
>  }
>  
> +static void vmx_setup_mce(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
> +		feature_control_valid_bits_add(vcpu, FEATURE_CONTROL_LMCE);
> +	else
> +		feature_control_valid_bits_del(vcpu, FEATURE_CONTROL_LMCE);
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> @@ -11074,6 +11106,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
>  	.pmu_ops = &intel_pmu_ops,
>  
>  	.update_pi_irte = vmx_update_pi_irte,
> +
> +	.setup_mce = vmx_setup_mce,
>  };
>  
>  static int __init vmx_init(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index bf22721..5bf76ab 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -70,7 +70,8 @@
>  
>  #define MAX_IO_MSRS 256
>  #define KVM_MAX_MCE_BANKS 32
> -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
> +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
> +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
>  
>  #define emul_to_vcpu(ctxt) \
>  	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
> @@ -983,6 +984,7 @@ static u32 emulated_msrs[] = {
>  	MSR_IA32_MISC_ENABLE,
>  	MSR_IA32_MCG_STATUS,
>  	MSR_IA32_MCG_CTL,
> +	MSR_IA32_MCG_EXT_CTL,
>  	MSR_IA32_SMBASE,
>  };
>  
> @@ -2684,11 +2686,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
>  		break;
>  	}
>  	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
> -		u64 mce_cap;
> -
> -		mce_cap = KVM_MCE_CAP_SUPPORTED;
>  		r = -EFAULT;
> -		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
> +		if (copy_to_user(argp, &kvm_mce_cap_supported,
> +				 sizeof(kvm_mce_cap_supported)))
>  			goto out;
>  		r = 0;
>  		break;
> @@ -2866,7 +2866,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
>  	r = -EINVAL;
>  	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
>  		goto out;
> -	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
> +	if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
>  		goto out;
>  	r = 0;
>  	vcpu->arch.mcg_cap = mcg_cap;
> @@ -2876,6 +2876,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
>  	/* Init IA32_MCi_CTL to all 1s */
>  	for (bank = 0; bank < bank_num; bank++)
>  		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
> +
> +	if (kvm_x86_ops->setup_mce)
> +		kvm_x86_ops->setup_mce(vcpu);
>  out:
>  	return r;
>  }
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haozhong Zhang June 16, 2016, 10:49 a.m. UTC | #2
On 06/16/16 12:04, Paolo Bonzini wrote:
> 
> 
> On 16/06/2016 08:05, Haozhong Zhang wrote:
> > From: Ashok Raj <ashok.raj@intel.com>
> > 
> > On Intel platforms, this patch adds LMCE to KVM MCE supported
> > capabilities and handles guest access to LMCE related MSRs.
> > 
> > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
> >            Only enable LMCE on Intel platform
> > 	   Check MSR_IA32_FEATURE_CONTROL when handling guest
> > 	     access to MSR_IA32_MCG_EXT_CTL]
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  5 +++++
> >  arch/x86/kvm/vmx.c              | 36 +++++++++++++++++++++++++++++++++++-
> >  arch/x86/kvm/x86.c              | 15 +++++++++------
> >  3 files changed, 49 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index e0fbe7e..75defa6 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -598,6 +598,7 @@ struct kvm_vcpu_arch {
> >  	u64 mcg_cap;
> >  	u64 mcg_status;
> >  	u64 mcg_ctl;
> > +	u64 mcg_ext_ctl;
> >  	u64 *mce_banks;
> >  
> >  	/* Cache MMIO info */
> > @@ -1005,6 +1006,8 @@ struct kvm_x86_ops {
> >  	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
> >  			      uint32_t guest_irq, bool set);
> >  	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
> > +
> > +	void (*setup_mce)(struct kvm_vcpu *vcpu);
> >  };
> >  
> >  struct kvm_arch_async_pf {
> > @@ -1077,6 +1080,8 @@ extern u8   kvm_tsc_scaling_ratio_frac_bits;
> >  /* maximum allowed value of TSC scaling ratio */
> >  extern u64  kvm_max_tsc_scaling_ratio;
> >  
> > +extern u64 kvm_mce_cap_supported;
> > +
> >  enum emulation_result {
> >  	EMULATE_DONE,         /* no further processing */
> >  	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 1dc89c5..42db42e 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -638,7 +638,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
> >   * feature_control_valid_bits_add/del(), so it's not included here.
> >   */
> >  #define FEATURE_CONTROL_MAX_VALID_BITS \
> > -	FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX
> > +	(FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX | FEATURE_CONTROL_LMCE)
> >  
> >  static void feature_control_valid_bits_add(struct kvm_vcpu *vcpu, uint64_t bits)
> >  {
> > @@ -2905,6 +2905,15 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
> >  	return valid_bits && !(val & ~valid_bits);
> >  }
> >  
> > +static inline bool vmx_mcg_ext_ctrl_msr_present(struct kvm_vcpu *vcpu,
> > +						bool host_initiated)
> > +{
> > +	return (vcpu->arch.mcg_cap & MCG_LMCE_P) &&
> 
> Checking MCG_LMCE_P is unnecessary, because you cannot set
> FEATURE_CONTROL_LMCE unless MCG_LMCE_P is present.
> 
> You can just inline this function in the callers, it's simpler.

I'll remove the first line check and inline the other parts.

> 
> > +		(host_initiated ||
> > +		 (to_vmx(vcpu)->msr_ia32_feature_control &
> > +		  FEATURE_CONTROL_LMCE));
> > +}
> > +
> >  /*
> >   * Reads an msr value (of 'msr_index') into 'pdata'.
> >   * Returns 0 on success, non-0 otherwise.
> > @@ -2946,6 +2955,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >  			return 1;
> >  		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
> >  		break;
> > +	case MSR_IA32_MCG_EXT_CTL:
> > +		if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
> > +						  msr_info->host_initiated))
> > +			return 1;
> > +		msr_info->data = vcpu->arch.mcg_ext_ctl;
> > +		break;
> >  	case MSR_IA32_FEATURE_CONTROL:
> >  		if (!vmx_feature_control_msr_valid(vcpu, 0))
> >  			return 1;
> > @@ -3039,6 +3054,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >  	case MSR_IA32_TSC_ADJUST:
> >  		ret = kvm_set_msr_common(vcpu, msr_info);
> >  		break;
> > +	case MSR_IA32_MCG_EXT_CTL:
> > +		if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
> > +						  msr_info->host_initiated) ||
> > +		    (data & ~MCG_EXT_CTL_LMCE_EN))
> > +			return 1;
> > +		vcpu->arch.mcg_ext_ctl = data;
> > +		break;
> >  	case MSR_IA32_FEATURE_CONTROL:
> >  		if (!vmx_feature_control_msr_valid(vcpu, data) ||
> >  		    (to_vmx(vcpu)->msr_ia32_feature_control &
> > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void)
> >  
> >  	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
> >  
> > +	kvm_mce_cap_supported |= MCG_LMCE_P;
> 
> Ah, so virtual LMCE is available on all processors!  This is
> interesting, but it also makes it more complicated to handle in QEMU; a
> new QEMU generally doesn't require a new kernel.
>

My QMEU patch checks KVM MCE capabilities before enabling LMCE (See
lmce_supported() and mce_init() in QEMU Patch 1), so either new QEMU
on old kernel or old QEMU on new kernel will work.

Haozhong

> Eduardo, any ideas?
> 
> Thanks,
> 
> Paolo
> 
> >  	return alloc_kvm_area();
> >  
> >  out8:
> > @@ -10950,6 +10974,14 @@ out:
> >  	return ret;
> >  }
> >  
> > +static void vmx_setup_mce(struct kvm_vcpu *vcpu)
> > +{
> > +	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
> > +		feature_control_valid_bits_add(vcpu, FEATURE_CONTROL_LMCE);
> > +	else
> > +		feature_control_valid_bits_del(vcpu, FEATURE_CONTROL_LMCE);
> > +}
> > +
> >  static struct kvm_x86_ops vmx_x86_ops = {
> >  	.cpu_has_kvm_support = cpu_has_kvm_support,
> >  	.disabled_by_bios = vmx_disabled_by_bios,
> > @@ -11074,6 +11106,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
> >  	.pmu_ops = &intel_pmu_ops,
> >  
> >  	.update_pi_irte = vmx_update_pi_irte,
> > +
> > +	.setup_mce = vmx_setup_mce,
> >  };
> >  
> >  static int __init vmx_init(void)
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index bf22721..5bf76ab 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -70,7 +70,8 @@
> >  
> >  #define MAX_IO_MSRS 256
> >  #define KVM_MAX_MCE_BANKS 32
> > -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
> > +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
> > +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
> >  
> >  #define emul_to_vcpu(ctxt) \
> >  	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
> > @@ -983,6 +984,7 @@ static u32 emulated_msrs[] = {
> >  	MSR_IA32_MISC_ENABLE,
> >  	MSR_IA32_MCG_STATUS,
> >  	MSR_IA32_MCG_CTL,
> > +	MSR_IA32_MCG_EXT_CTL,
> >  	MSR_IA32_SMBASE,
> >  };
> >  
> > @@ -2684,11 +2686,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
> >  		break;
> >  	}
> >  	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
> > -		u64 mce_cap;
> > -
> > -		mce_cap = KVM_MCE_CAP_SUPPORTED;
> >  		r = -EFAULT;
> > -		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
> > +		if (copy_to_user(argp, &kvm_mce_cap_supported,
> > +				 sizeof(kvm_mce_cap_supported)))
> >  			goto out;
> >  		r = 0;
> >  		break;
> > @@ -2866,7 +2866,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> >  	r = -EINVAL;
> >  	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
> >  		goto out;
> > -	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
> > +	if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
> >  		goto out;
> >  	r = 0;
> >  	vcpu->arch.mcg_cap = mcg_cap;
> > @@ -2876,6 +2876,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> >  	/* Init IA32_MCi_CTL to all 1s */
> >  	for (bank = 0; bank < bank_num; bank++)
> >  		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
> > +
> > +	if (kvm_x86_ops->setup_mce)
> > +		kvm_x86_ops->setup_mce(vcpu);
> >  out:
> >  	return r;
> >  }
> > 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eduardo Habkost June 16, 2016, 2:55 p.m. UTC | #3
On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote:
> On 16/06/2016 08:05, Haozhong Zhang wrote:
> > From: Ashok Raj <ashok.raj@intel.com>
> > 
> > On Intel platforms, this patch adds LMCE to KVM MCE supported
> > capabilities and handles guest access to LMCE related MSRs.
> > 
> > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
> >            Only enable LMCE on Intel platform
> > 	   Check MSR_IA32_FEATURE_CONTROL when handling guest
> > 	     access to MSR_IA32_MCG_EXT_CTL]
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
[...]
> > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void)
> >  
> >  	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
> >  
> > +	kvm_mce_cap_supported |= MCG_LMCE_P;
> 
> Ah, so virtual LMCE is available on all processors!  This is
> interesting, but it also makes it more complicated to handle in QEMU; a
> new QEMU generally doesn't require a new kernel.
> 
> Eduardo, any ideas?

(CCing libvirt list)

As we shouldn't make machine-type changes introduce new host
requirements, it looks like we need to either add a new set of
CPU models (unreasonable), or expect management software to
explicitly enable LMCE after ensuring the host supports it.

Or we could wait for a reasonable time after the feature is
available in the kernel, and declare that QEMU as a whole
requires a newer kernel. But how much time would be reasonable
for that?

Long term, I believe we should think of a better solution. I
don't think it is reasonable to require new libvirt code to be
written for every single low-level feature that requires a newer
kernel or newer host hardware. Maybe new introspection interfaces
that would allow us to drop the "no new requirements on
machine-type changes" rule?
Haozhong Zhang June 17, 2016, 1:11 a.m. UTC | #4
On 06/16/16 11:55, Eduardo Habkost wrote:
> On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote:
> > On 16/06/2016 08:05, Haozhong Zhang wrote:
> > > From: Ashok Raj <ashok.raj@intel.com>
> > > 
> > > On Intel platforms, this patch adds LMCE to KVM MCE supported
> > > capabilities and handles guest access to LMCE related MSRs.
> > > 
> > > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
> > >            Only enable LMCE on Intel platform
> > > 	   Check MSR_IA32_FEATURE_CONTROL when handling guest
> > > 	     access to MSR_IA32_MCG_EXT_CTL]
> > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> [...]
> > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void)
> > >  
> > >  	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
> > >  
> > > +	kvm_mce_cap_supported |= MCG_LMCE_P;
> > 
> > Ah, so virtual LMCE is available on all processors!  This is
> > interesting, but it also makes it more complicated to handle in QEMU; a
> > new QEMU generally doesn't require a new kernel.
> > 
> > Eduardo, any ideas?
> 
> (CCing libvirt list)
> 
> As we shouldn't make machine-type changes introduce new host
> requirements, it looks like we need to either add a new set of
> CPU models (unreasonable), or expect management software to
> explicitly enable LMCE after ensuring the host supports it.
>
> Or we could wait for a reasonable time after the feature is
> available in the kernel, and declare that QEMU as a whole
> requires a newer kernel. But how much time would be reasonable
> for that?
> 
> Long term, I believe we should think of a better solution. I
> don't think it is reasonable to require new libvirt code to be
> written for every single low-level feature that requires a newer
> kernel or newer host hardware. Maybe new introspection interfaces
> that would allow us to drop the "no new requirements on
> machine-type changes" rule?
>

Because new MSR (MSR_IA32_MCG_EXT_CTL) and new MSR bit
(FEATURE_CONTROL_LMCE in MSR_IA32_FEATURE_CONTROL) are introduced by
LMCE, QEMU requires new KVM which can handle those changes.

I'm not familiar with libvirt. Does the requirement of new KVM
capability bring any troubles to libvirt?

Thanks,
Haozhong
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eduardo Habkost June 17, 2016, 5:15 p.m. UTC | #5
On Fri, Jun 17, 2016 at 09:11:16AM +0800, Haozhong Zhang wrote:
> On 06/16/16 11:55, Eduardo Habkost wrote:
> > On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote:
> > > On 16/06/2016 08:05, Haozhong Zhang wrote:
> > > > From: Ashok Raj <ashok.raj@intel.com>
> > > > 
> > > > On Intel platforms, this patch adds LMCE to KVM MCE supported
> > > > capabilities and handles guest access to LMCE related MSRs.
> > > > 
> > > > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
> > > >            Only enable LMCE on Intel platform
> > > > 	   Check MSR_IA32_FEATURE_CONTROL when handling guest
> > > > 	     access to MSR_IA32_MCG_EXT_CTL]
> > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > [...]
> > > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void)
> > > >  
> > > >  	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
> > > >  
> > > > +	kvm_mce_cap_supported |= MCG_LMCE_P;
> > > 
> > > Ah, so virtual LMCE is available on all processors!  This is
> > > interesting, but it also makes it more complicated to handle in QEMU; a
> > > new QEMU generally doesn't require a new kernel.
> > > 
> > > Eduardo, any ideas?
> > 
> > (CCing libvirt list)
> > 
> > As we shouldn't make machine-type changes introduce new host
> > requirements, it looks like we need to either add a new set of
> > CPU models (unreasonable), or expect management software to
> > explicitly enable LMCE after ensuring the host supports it.
> >
> > Or we could wait for a reasonable time after the feature is
> > available in the kernel, and declare that QEMU as a whole
> > requires a newer kernel. But how much time would be reasonable
> > for that?
> > 
> > Long term, I believe we should think of a better solution. I
> > don't think it is reasonable to require new libvirt code to be
> > written for every single low-level feature that requires a newer
> > kernel or newer host hardware. Maybe new introspection interfaces
> > that would allow us to drop the "no new requirements on
> > machine-type changes" rule?
> >
> 
> Because new MSR (MSR_IA32_MCG_EXT_CTL) and new MSR bit
> (FEATURE_CONTROL_LMCE in MSR_IA32_FEATURE_CONTROL) are introduced by
> LMCE, QEMU requires new KVM which can handle those changes.

If I understood correctly, you are describing the second option
above (declaring that QEMU as a whole requires a newer kernel).

> 
> I'm not familiar with libvirt. Does the requirement of new KVM
> capability bring any troubles to libvirt?

It does, assuming that we still support running QEMU under an
older kernel where KVM doesn't LMCE. In this case, the pc-2.6
machine-type will run, but the pc-2.7 machine-type won't.

The requirement of new KVM capabilities based on the machine-type
is a problem for livirt. libvirt have some host-capabilities APIs
to allow software to check if the VM can be run on (or migrated
to) a host, but none of them are based on machine-type.

This is not necessarily specific to libvirt: people may have
their own configuration or scripts that use the default "pc"
alias, and a QEMU upgrade shouldn't break their configuration.
Haozhong Zhang June 20, 2016, 1:49 a.m. UTC | #6
On 06/17/16 14:15, Eduardo Habkost wrote:
> On Fri, Jun 17, 2016 at 09:11:16AM +0800, Haozhong Zhang wrote:
> > On 06/16/16 11:55, Eduardo Habkost wrote:
> > > On Thu, Jun 16, 2016 at 12:04:50PM +0200, Paolo Bonzini wrote:
> > > > On 16/06/2016 08:05, Haozhong Zhang wrote:
> > > > > From: Ashok Raj <ashok.raj@intel.com>
> > > > > 
> > > > > On Intel platforms, this patch adds LMCE to KVM MCE supported
> > > > > capabilities and handles guest access to LMCE related MSRs.
> > > > > 
> > > > > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > > > > [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
> > > > >            Only enable LMCE on Intel platform
> > > > > 	   Check MSR_IA32_FEATURE_CONTROL when handling guest
> > > > > 	     access to MSR_IA32_MCG_EXT_CTL]
> > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > [...]
> > > > > @@ -6433,6 +6455,8 @@ static __init int hardware_setup(void)
> > > > >  
> > > > >  	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
> > > > >  
> > > > > +	kvm_mce_cap_supported |= MCG_LMCE_P;
> > > > 
> > > > Ah, so virtual LMCE is available on all processors!  This is
> > > > interesting, but it also makes it more complicated to handle in QEMU; a
> > > > new QEMU generally doesn't require a new kernel.
> > > > 
> > > > Eduardo, any ideas?
> > > 
> > > (CCing libvirt list)
> > > 
> > > As we shouldn't make machine-type changes introduce new host
> > > requirements, it looks like we need to either add a new set of
> > > CPU models (unreasonable), or expect management software to
> > > explicitly enable LMCE after ensuring the host supports it.
> > >
> > > Or we could wait for a reasonable time after the feature is
> > > available in the kernel, and declare that QEMU as a whole
> > > requires a newer kernel. But how much time would be reasonable
> > > for that?
> > > 
> > > Long term, I believe we should think of a better solution. I
> > > don't think it is reasonable to require new libvirt code to be
> > > written for every single low-level feature that requires a newer
> > > kernel or newer host hardware. Maybe new introspection interfaces
> > > that would allow us to drop the "no new requirements on
> > > machine-type changes" rule?
> > >
> > 
> > Because new MSR (MSR_IA32_MCG_EXT_CTL) and new MSR bit
> > (FEATURE_CONTROL_LMCE in MSR_IA32_FEATURE_CONTROL) are introduced by
> > LMCE, QEMU requires new KVM which can handle those changes.
> 
> If I understood correctly, you are describing the second option
> above (declaring that QEMU as a whole requires a newer kernel).
> 
> > 
> > I'm not familiar with libvirt. Does the requirement of new KVM
> > capability bring any troubles to libvirt?
>
> It does, assuming that we still support running QEMU under an
> older kernel where KVM doesn't LMCE. In this case, the pc-2.6
> machine-type will run, but the pc-2.7 machine-type won't.
>
> The requirement of new KVM capabilities based on the machine-type
> is a problem for livirt. libvirt have some host-capabilities APIs
> to allow software to check if the VM can be run on (or migrated
> to) a host, but none of them are based on machine-type.
> 
> This is not necessarily specific to libvirt: people may have
> their own configuration or scripts that use the default "pc"
> alias, and a QEMU upgrade shouldn't break their configuration.
>

Thanks for the explanation!

If we disable LMCE in QEMU by default (even for -cpu host), will it
still be a problem? That is,

- pc-2.7 can continue to run on old kernels unless users explicitly
  require LMCE

- existing libvirt VM configurations can continue to work on pc-2.7
  because LMCE is not specified in those configurations and are
  disabled by default (i.e. no requirement for new kernels)

- existing QEMU configurations/scripts using pc alias can continue to
  work on pc-27 for the same reason above.

Thanks,
Haozhong
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini June 20, 2016, 6:55 a.m. UTC | #7
On 20/06/2016 03:49, Haozhong Zhang wrote:
> Thanks for the explanation!
> 
> If we disable LMCE in QEMU by default (even for -cpu host), will it
> still be a problem? That is,
> 
> - pc-2.7 can continue to run on old kernels unless users explicitly
>   require LMCE
> 
> - existing libvirt VM configurations can continue to work on pc-2.7
>   because LMCE is not specified in those configurations and are
>   disabled by default (i.e. no requirement for new kernels)
> 
> - existing QEMU configurations/scripts using pc alias can continue to
>   work on pc-27 for the same reason above.

Yes, that would be fine.  "-cpu host" can leave LMCE enabled if
supported by the kernel.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e0fbe7e..75defa6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -598,6 +598,7 @@  struct kvm_vcpu_arch {
 	u64 mcg_cap;
 	u64 mcg_status;
 	u64 mcg_ctl;
+	u64 mcg_ext_ctl;
 	u64 *mce_banks;
 
 	/* Cache MMIO info */
@@ -1005,6 +1006,8 @@  struct kvm_x86_ops {
 	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
 			      uint32_t guest_irq, bool set);
 	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+
+	void (*setup_mce)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -1077,6 +1080,8 @@  extern u8   kvm_tsc_scaling_ratio_frac_bits;
 /* maximum allowed value of TSC scaling ratio */
 extern u64  kvm_max_tsc_scaling_ratio;
 
+extern u64 kvm_mce_cap_supported;
+
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
 	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1dc89c5..42db42e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -638,7 +638,7 @@  static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
  * feature_control_valid_bits_add/del(), so it's not included here.
  */
 #define FEATURE_CONTROL_MAX_VALID_BITS \
-	FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX
+	(FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX | FEATURE_CONTROL_LMCE)
 
 static void feature_control_valid_bits_add(struct kvm_vcpu *vcpu, uint64_t bits)
 {
@@ -2905,6 +2905,15 @@  static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
 	return valid_bits && !(val & ~valid_bits);
 }
 
+static inline bool vmx_mcg_ext_ctrl_msr_present(struct kvm_vcpu *vcpu,
+						bool host_initiated)
+{
+	return (vcpu->arch.mcg_cap & MCG_LMCE_P) &&
+		(host_initiated ||
+		 (to_vmx(vcpu)->msr_ia32_feature_control &
+		  FEATURE_CONTROL_LMCE));
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -2946,6 +2955,12 @@  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
+	case MSR_IA32_MCG_EXT_CTL:
+		if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
+						  msr_info->host_initiated))
+			return 1;
+		msr_info->data = vcpu->arch.mcg_ext_ctl;
+		break;
 	case MSR_IA32_FEATURE_CONTROL:
 		if (!vmx_feature_control_msr_valid(vcpu, 0))
 			return 1;
@@ -3039,6 +3054,13 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSC_ADJUST:
 		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
+	case MSR_IA32_MCG_EXT_CTL:
+		if (!vmx_mcg_ext_ctrl_msr_present(vcpu,
+						  msr_info->host_initiated) ||
+		    (data & ~MCG_EXT_CTL_LMCE_EN))
+			return 1;
+		vcpu->arch.mcg_ext_ctl = data;
+		break;
 	case MSR_IA32_FEATURE_CONTROL:
 		if (!vmx_feature_control_msr_valid(vcpu, data) ||
 		    (to_vmx(vcpu)->msr_ia32_feature_control &
@@ -6433,6 +6455,8 @@  static __init int hardware_setup(void)
 
 	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
+	kvm_mce_cap_supported |= MCG_LMCE_P;
+
 	return alloc_kvm_area();
 
 out8:
@@ -10950,6 +10974,14 @@  out:
 	return ret;
 }
 
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+		feature_control_valid_bits_add(vcpu, FEATURE_CONTROL_LMCE);
+	else
+		feature_control_valid_bits_del(vcpu, FEATURE_CONTROL_LMCE);
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -11074,6 +11106,8 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.pmu_ops = &intel_pmu_ops,
 
 	.update_pi_irte = vmx_update_pi_irte,
+
+	.setup_mce = vmx_setup_mce,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bf22721..5bf76ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -70,7 +70,8 @@ 
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
 	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -983,6 +984,7 @@  static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
+	MSR_IA32_MCG_EXT_CTL,
 	MSR_IA32_SMBASE,
 };
 
@@ -2684,11 +2686,9 @@  long kvm_arch_dev_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
-		u64 mce_cap;
-
-		mce_cap = KVM_MCE_CAP_SUPPORTED;
 		r = -EFAULT;
-		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+		if (copy_to_user(argp, &kvm_mce_cap_supported,
+				 sizeof(kvm_mce_cap_supported)))
 			goto out;
 		r = 0;
 		break;
@@ -2866,7 +2866,7 @@  static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
-	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+	if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
 		goto out;
 	r = 0;
 	vcpu->arch.mcg_cap = mcg_cap;
@@ -2876,6 +2876,9 @@  static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	/* Init IA32_MCi_CTL to all 1s */
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+	if (kvm_x86_ops->setup_mce)
+		kvm_x86_ops->setup_mce(vcpu);
 out:
 	return r;
 }