
Add MCE support to KVM

Message ID 1239155601.6384.3.camel@yhuang-dev.sh.intel.com
State New, archived

Commit Message

Huang, Ying April 8, 2009, 1:53 a.m. UTC
Add MCE support to KVM. The related MSRs are emulated. A new vcpu
ioctl command, KVM_X86_SETUP_MCE, is used to set up MCE emulation
state such as mcg_cap. An MCE is injected via the vcpu ioctl command
KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
not implemented.

Signed-off-by: Huang Ying <ying.huang@intel.com>

---
 arch/x86/include/asm/kvm_host.h |    5 
 arch/x86/include/asm/mce.h      |    1 
 arch/x86/kvm/x86.c              |  202 +++++++++++++++++++++++++++++++++++-----
 include/linux/kvm.h             |   15 ++
 4 files changed, 199 insertions(+), 24 deletions(-)
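
For orientation, here is a minimal userspace sketch of how the two new
ioctls might be driven. The kvm_x86_mce field names follow the patch; the
exact struct layout, the userspace availability of the MCG_xxx/MCI_xxx
constants, and all concrete values are illustrative assumptions, not part
of the patch:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>   /* would provide the ioctls and struct kvm_x86_mce */
#include <asm/mce.h>     /* assumed source of the MCG_xxx/MCI_xxx bits */

static int setup_and_inject_mce(int vcpu_fd)
{
	/* Request 6 banks plus the MCG_CTL register.  The kernel masks
	 * off what it does not support and, since KVM_X86_SETUP_MCE is
	 * declared _IOWR, writes the negotiated value back. */
	uint64_t mcg_cap = 6 | MCG_CTL_P;
	struct kvm_x86_mce mce;

	if (ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap) < 0)
		return -1;

	/* Inject an enabled, uncorrected error into bank 0. */
	memset(&mce, 0, sizeof(mce));
	mce.bank = 0;
	mce.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_UC;
	mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
	mce.addr = 0x1234000;	/* example error address */
	return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
}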

Comments

Avi Kivity April 9, 2009, 3:50 p.m. UTC | #1
Huang Ying wrote:
> Add MCE support to KVM. The related MSRs are emulated. A new vcpu
> ioctl command, KVM_X86_SETUP_MCE, is used to set up MCE emulation
> state such as mcg_cap. An MCE is injected via the vcpu ioctl command
> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
> not implemented.
>
> Signed-off-by: Huang Ying <ying.huang@intel.com>
>
> ---
>  arch/x86/include/asm/kvm_host.h |    5 
>  arch/x86/include/asm/mce.h      |    1 
>  arch/x86/kvm/x86.c              |  202 +++++++++++++++++++++++++++++++++++-----
>  include/linux/kvm.h             |   15 ++
>  4 files changed, 199 insertions(+), 24 deletions(-)
>
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -42,6 +42,7 @@
>  #include <asm/msr.h>
>  #include <asm/desc.h>
>  #include <asm/mtrr.h>
> +#include <asm/mce.h>
>  
>  #define MAX_IO_MSRS 256
>  #define CR0_RESERVED_BITS						\
> @@ -734,23 +735,43 @@ static int set_msr_mtrr(struct kvm_vcpu 
>  	return 0;
>  }
>  
> -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  {
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned bank_num = mcg_cap & 0xff;
> +
>  	switch (msr) {
> -	case MSR_EFER:
> -		set_efer(vcpu, data);
> -		break;
> -	case MSR_IA32_MC0_STATUS:
> -		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
> -		       __func__, data);
> -		break;
>  	case MSR_IA32_MCG_STATUS:
> -		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
> -			__func__, data);
> +		vcpu->arch.mcg_status = data;
>  		break;
>  	case MSR_IA32_MCG_CTL:
> -		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
> -			__func__, data);
> +		if (!(mcg_cap & MCG_CTL_P))
> +			return 1;
> +		if (data != 0 && data != ~(u64)0)
> +			return -1;
> +		vcpu->arch.mcg_ctl = data;
> +		break;
> +	default:
> +		if (msr >= MSR_IA32_MC0_CTL &&
> +		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
> +			u32 offset = msr - MSR_IA32_MC0_CTL;
> +			/* only 0 or all 1s can be written to IA32_MCi_CTL */
> +			if ((offset & 0x3) == 0 &&
> +			    data != 0 && data != ~(u64)0)
> +				return -1;
> +			vcpu->arch.mce_banks[offset] = data;
> +			break;
> +		}
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> +{
> +	switch (msr) {
> +	case MSR_EFER:
> +		set_efer(vcpu, data);
>  		break;
>  	case MSR_IA32_DEBUGCTLMSR:
>  		if (!data) {
> @@ -807,6 +828,8 @@ int kvm_set_msr_common(struct kvm_vcpu *
>  		break;
>  	}
>  	default:
> +		if (!set_msr_mce(vcpu, msr, data))
> +		    break;
>  		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
>  		return 1;
>  	}
>   

Is there any reason you split kvm_set_msr_common() into two functions?

>  
> -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>  {
>  	u64 data;
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned bank_num = mcg_cap & 0xff;
>  
>  	switch (msr) {
> -	case 0xc0010010: /* SYSCFG */
> -	case 0xc0010015: /* HWCR */
> -	case MSR_IA32_PLATFORM_ID:
>  	case MSR_IA32_P5_MC_ADDR:
>  	case MSR_IA32_P5_MC_TYPE:
> -	case MSR_IA32_MC0_CTL:
> -	case MSR_IA32_MCG_STATUS:
> +		data = 0;
> +		break;
>  	case MSR_IA32_MCG_CAP:
> +		data = vcpu->arch.mcg_cap;
> +		break;
>  	case MSR_IA32_MCG_CTL:
> -	case MSR_IA32_MC0_MISC:
> -	case MSR_IA32_MC0_MISC+4:
> -	case MSR_IA32_MC0_MISC+8:
> -	case MSR_IA32_MC0_MISC+12:
> -	case MSR_IA32_MC0_MISC+16:
> -	case MSR_IA32_MC0_MISC+20:
> +		if (!(mcg_cap & MCG_CTL_P))
> +			return 1;
> +		data = vcpu->arch.mcg_ctl;
> +		break;
> +	case MSR_IA32_MCG_STATUS:
> +		data = vcpu->arch.mcg_status;
> +		break;
> +	default:
> +		if (msr >= MSR_IA32_MC0_CTL &&
> +		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
> +			u32 offset = msr - MSR_IA32_MC0_CTL;
> +			data = vcpu->arch.mce_banks[offset];
> +			break;
> +		}
> +		return 1;
> +	}
> +	*pdata = data;
> +	return 0;
> +}
> +
> +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> +{
> +	u64 data;
> +
> +	switch (msr) {
> +	case 0xc0010010: /* SYSCFG */
> +	case 0xc0010015: /* HWCR */
>   

Please use MSR_ constants (add them if they don't exist yet).

> +	case MSR_IA32_PLATFORM_ID:
>  	case MSR_IA32_UCODE_REV:
>  	case MSR_IA32_EBL_CR_POWERON:
>  	case MSR_IA32_DEBUGCTLMSR:
> @@ -921,6 +967,8 @@ int kvm_get_msr_common(struct kvm_vcpu *
>  		data = vcpu->arch.time;
>  		break;
>  	default:
> +		if (!get_msr_mce(vcpu, msr, &data))
> +			break;
>  		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
>  		return 1;
>  	}
> @@ -1443,6 +1491,87 @@ static int vcpu_ioctl_tpr_access_reporti
>  	return 0;
>  }
>   

Again, why split?

>  
> +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> +					u64 mcg_cap)
> +{
> +	int r;
> +	unsigned bank_num = mcg_cap & 0xff, bank;
> +	u64 *banks;
> +
> +	r = -EINVAL;
> +	if (!bank_num)
> +		goto out;
> +	r = -ENOMEM;
> +	banks = kzalloc(bank_num * sizeof(u64) * 4, GFP_KERNEL);
>   

If banks is already allocated, you'll leak it.

Why not always allocate it on vcpu setup?

> +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> +				      struct kvm_x86_mce *mce)
> +{
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned bank_num = mcg_cap & 0xff;
> +	u64 *banks = vcpu->arch.mce_banks;
> +
> +	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
> +		return -EINVAL;
> +	/*
> +	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
> +	 * reporting is disabled
> +	 */
> +	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
> +	    vcpu->arch.mcg_ctl != ~(u64)0)
> +		return 0;
> +	banks += 4 * mce->bank;
> +	/*
> +	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
> +	 * reporting is disabled for the bank
> +	 */
> +	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
> +		return 0;
> +	if (mce->status & MCI_STATUS_UC) {
> +		u64 status = mce->status;
> +		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
> +		    !(vcpu->arch.cr4 & X86_CR4_MCE)) {
> +			printk(KERN_DEBUG "kvm: set_mce: "
> +			       "injects mce exception while "
> +			       "previous one is in progress!\n");
> +			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
> +			return 0;
> +		}
> +		if (banks[1] & MCI_STATUS_VAL)
> +			status |= MCI_STATUS_OVER;
> +		banks[1] = mce->status;
> +		banks[2] = mce->addr;
> +		banks[3] = mce->misc;
> +		vcpu->arch.mcg_status = mce->mcg_status;
> +		kvm_queue_exception(vcpu, MC_VECTOR);
> +	} else if (!(banks[1] & MCI_STATUS_VAL) ||
> +		   (!(banks[1] & MCI_STATUS_UC) &&
> +		    !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) {
> +		u64 status = mce->status;
> +		if (banks[1] & MCI_STATUS_VAL)
> +			status |= MCI_STATUS_OVER;
> +		banks[1] = mce->status;
> +		banks[2] = mce->addr;
> +		banks[3] = mce->misc;
> +	} else
> +		banks[1] |= MCI_STATUS_OVER;
> +	return 0;
> +}
>   

Can userspace just use KVM_SET_MSR for this?

> +	case KVM_X86_SETUP_MCE: {
> +		u64 mcg_cap;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
> +			goto out;
> +		/*
> +		 * extended machine-check state registers and CMCI are
> +		 * not supported.
> +		 */
> +		mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
>   

Instead of silently dropping, should return an error.

> +		if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
> +			goto out;
>   

And not copy.



>  #define KVM_TRC_SHIFT           16
>  /*
>   * kvm trace categories
> @@ -528,6 +540,9 @@ struct kvm_irq_routing {
>  #define KVM_NMI                   _IO(KVMIO,  0x9a)
>  /* Available with KVM_CAP_SET_GUEST_DEBUG */
>  #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
> +/* MCE for x86 */
> +#define KVM_X86_SETUP_MCE           _IOWR(KVMIO,  0x9a, __u64)
> +#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9b, struct kvm_x86_mce)
>   

Please add a KVM_CAP_ to advertise this capability.
Huang, Ying April 10, 2009, 3 a.m. UTC | #2
On Thu, 2009-04-09 at 23:50 +0800, Avi Kivity wrote:
> Huang Ying wrote:
> > +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> > +{
> > +	switch (msr) {
> > +	case MSR_EFER:
> > +		set_efer(vcpu, data);
> >  		break;
> >  	case MSR_IA32_DEBUGCTLMSR:
> >  		if (!data) {
> > @@ -807,6 +828,8 @@ int kvm_set_msr_common(struct kvm_vcpu *
> >  		break;
> >  	}
> >  	default:
> > +		if (!set_msr_mce(vcpu, msr, data))
> > +		    break;
> >  		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
> >  		return 1;
> >  	}
> >   
> 
> Is there any reason you split kvm_set_msr_common() into two functions?

I want to group the MCE-related MSRs together. And most MCE MSR
reads/writes need to access vcpu->arch.mcg_xxx or vcpu->arch.mce_banks,
so I think an MCE-specific function would be cleaner.

But it seems that something like the following would be better.

kvm_set_msr_comm()
{
	switch (msr) {
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_MISC + 4 * KVM_MCE_MAX_BANK:
		set_msr_mce();
		break;
	...
	}
	...
}

> > +
> > +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> > +{
> > +	u64 data;
> > +
> > +	switch (msr) {
> > +	case 0xc0010010: /* SYSCFG */
> > +	case 0xc0010015: /* HWCR */
> >   
> 
> Please use MSR_ constants (add them if they don't exist yet).

In fact, these were not added by me, but I can change them while I'm at it.

> >  
> > +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> > +					u64 mcg_cap)
> > +{
> > +	int r;
> > +	unsigned bank_num = mcg_cap & 0xff, bank;
> > +	u64 *banks;
> > +
> > +	r = -EINVAL;
> > +	if (!bank_num)
> > +		goto out;
> > +	r = -ENOMEM;
> > +	banks = kzalloc(bank_num * sizeof(u64) * 4, GFP_KERNEL);
> >   
> 
> If banks is already allocated, you'll leak it.

Yes. I will fix this.

> Why not always allocate it on vcpu setup?

Because the MCE bank number is not fixed, it is based on mcg_cap from
user space.

> > +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> > +				      struct kvm_x86_mce *mce)
> > +{
> > +	u64 mcg_cap = vcpu->arch.mcg_cap;
> > +	unsigned bank_num = mcg_cap & 0xff;
> > +	u64 *banks = vcpu->arch.mce_banks;
> > +
> > +	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
> > +		return -EINVAL;
> > +	/*
> > +	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
> > +	 * reporting is disabled
> > +	 */
> > +	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
> > +	    vcpu->arch.mcg_ctl != ~(u64)0)
> > +		return 0;
> > +	banks += 4 * mce->bank;
> > +	/*
> > +	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
> > +	 * reporting is disabled for the bank
> > +	 */
> > +	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
> > +		return 0;
> > +	if (mce->status & MCI_STATUS_UC) {
> > +		u64 status = mce->status;
> > +		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
> > +		    !(vcpu->arch.cr4 & X86_CR4_MCE)) {
> > +			printk(KERN_DEBUG "kvm: set_mce: "
> > +			       "injects mce exception while "
> > +			       "previous one is in progress!\n");
> > +			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
> > +			return 0;
> > +		}
> > +		if (banks[1] & MCI_STATUS_VAL)
> > +			status |= MCI_STATUS_OVER;
> > +		banks[1] = mce->status;
> > +		banks[2] = mce->addr;
> > +		banks[3] = mce->misc;
> > +		vcpu->arch.mcg_status = mce->mcg_status;
> > +		kvm_queue_exception(vcpu, MC_VECTOR);
> > +	} else if (!(banks[1] & MCI_STATUS_VAL) ||
> > +		   (!(banks[1] & MCI_STATUS_UC) &&
> > +		    !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) {
> > +		u64 status = mce->status;
> > +		if (banks[1] & MCI_STATUS_VAL)
> > +			status |= MCI_STATUS_OVER;
> > +		banks[1] = mce->status;
> > +		banks[2] = mce->addr;
> > +		banks[3] = mce->misc;
> > +	} else
> > +		banks[1] |= MCI_STATUS_OVER;
> > +	return 0;
> > +}
> >   
> 
> Can userspace just use KVM_SET_MSR for this?

In addition to assigning the MSR, we have some other logic for MCE,
such as per-bank reporting disabling, overwrite rules, and triple fault
for a UC MCE during MCIP. So I think we need a dedicated interface.

> > +	case KVM_X86_SETUP_MCE: {
> > +		u64 mcg_cap;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
> > +			goto out;
> > +		/*
> > +		 * extended machine-check state registers and CMCI are
> > +		 * not supported.
> > +		 */
> > +		mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
> >   
> 
> Instead of silently dropping, should return an error.
> 
> > +		if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
> > +			goto out;
> >   
> 
> And not copy.

This is designed as a kind of feature negotiation. Userspace requests
MCE features via mcg_cap; the kernel removes unsupported features and
returns the resulting mcg_cap.

> 
> >  #define KVM_TRC_SHIFT           16
> >  /*
> >   * kvm trace categories
> > @@ -528,6 +540,9 @@ struct kvm_irq_routing {
> >  #define KVM_NMI                   _IO(KVMIO,  0x9a)
> >  /* Available with KVM_CAP_SET_GUEST_DEBUG */
> >  #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
> > +/* MCE for x86 */
> > +#define KVM_X86_SETUP_MCE           _IOWR(KVMIO,  0x9a, __u64)
> > +#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9b, struct kvm_x86_mce)
> >   
> 
> Please add a KVM_CAP_ to advertise this capability.

Yes. I will do this.

Best Regards,
Huang Ying
Avi Kivity April 11, 2009, 12:04 p.m. UTC | #3
Huang Ying wrote:
> On Thu, 2009-04-09 at 23:50 +0800, Avi Kivity wrote:
>   
>> Huang Ying wrote:
>>     
>>> +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>> +{
>>> +	switch (msr) {
>>> +	case MSR_EFER:
>>> +		set_efer(vcpu, data);
>>>  		break;
>>>  	case MSR_IA32_DEBUGCTLMSR:
>>>  		if (!data) {
>>> @@ -807,6 +828,8 @@ int kvm_set_msr_common(struct kvm_vcpu *
>>>  		break;
>>>  	}
>>>  	default:
>>> +		if (!set_msr_mce(vcpu, msr, data))
>>> +		    break;
>>>  		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
>>>  		return 1;
>>>  	}
>>>   
>>>       
>> Is there any reason you split kvm_set_msr_common() into two functions?
>>     
>
> I want to group the MCE-related MSRs together. And most MCE MSR
> reads/writes need to access vcpu->arch.mcg_xxx or vcpu->arch.mce_banks,
> so I think an MCE-specific function would be cleaner.
>
> But it seems that something like the following would be better.
>
> kvm_set_msr_comm()
> {
> 	switch (msr) {
> 	case MSR_IA32_P5_MC_ADDR:
> 	case MSR_IA32_P5_MC_TYPE:
> 	case MSR_IA32_MCG_CAP:
> 	case MSR_IA32_MCG_CTL:
> 	case MSR_IA32_MCG_STATUS:
> 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_MISC + 4 * KVM_MCE_MAX_BANK:
> 		set_msr_mce();
> 		break;
> 	...
> 	}
> 	...
> }
>
>   

Yes.  Just make sure KVM_MCE_MAX_BANK (better change to KVM_MCE_NR_BANK, 
with MAX you never know if it's the index of the last bank or the number 
of banks) doesn't conflict with any other MSRs.

>>> +
>>> +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>>> +{
>>> +	u64 data;
>>> +
>>> +	switch (msr) {
>>> +	case 0xc0010010: /* SYSCFG */
>>> +	case 0xc0010015: /* HWCR */
>>>   
>>>       
>> Please use MSR_ constants (add them if they don't exist yet).
>>     
>
> In fact, these were not added by me, but I can change them while I'm at it.
>   

Oh okay.  So don't change them in this patch.

>> Why not always allocate it on vcpu setup?
>>     
>
> Because the MCE bank number is not fixed, it is based on mcg_cap from
> user space.
>   

Right, but we can allocate the maximum number, no?  it's a fairly small 
amount of memory.

>   
>>> +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
>>> +				      struct kvm_x86_mce *mce)
>>> +{
>>> +	u64 mcg_cap = vcpu->arch.mcg_cap;
>>> +	unsigned bank_num = mcg_cap & 0xff;
>>> +	u64 *banks = vcpu->arch.mce_banks;
>>> +
>>> +	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
>>> +		return -EINVAL;
>>> +	/*
>>> +	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
>>> +	 * reporting is disabled
>>> +	 */
>>> +	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
>>> +	    vcpu->arch.mcg_ctl != ~(u64)0)
>>> +		return 0;
>>> +	banks += 4 * mce->bank;
>>> +	/*
>>> +	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
>>> +	 * reporting is disabled for the bank
>>> +	 */
>>> +	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
>>> +		return 0;
>>> +	if (mce->status & MCI_STATUS_UC) {
>>> +		u64 status = mce->status;
>>> +		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
>>> +		    !(vcpu->arch.cr4 & X86_CR4_MCE)) {
>>> +			printk(KERN_DEBUG "kvm: set_mce: "
>>> +			       "injects mce exception while "
>>> +			       "previous one is in progress!\n");
>>> +			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
>>> +			return 0;
>>> +		}
>>> +		if (banks[1] & MCI_STATUS_VAL)
>>> +			status |= MCI_STATUS_OVER;
>>> +		banks[1] = mce->status;
>>> +		banks[2] = mce->addr;
>>> +		banks[3] = mce->misc;
>>> +		vcpu->arch.mcg_status = mce->mcg_status;
>>> +		kvm_queue_exception(vcpu, MC_VECTOR);
>>> +	} else if (!(banks[1] & MCI_STATUS_VAL) ||
>>> +		   (!(banks[1] & MCI_STATUS_UC) &&
>>> +		    !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) {
>>> +		u64 status = mce->status;
>>> +		if (banks[1] & MCI_STATUS_VAL)
>>> +			status |= MCI_STATUS_OVER;
>>> +		banks[1] = mce->status;
>>> +		banks[2] = mce->addr;
>>> +		banks[3] = mce->misc;
>>> +	} else
>>> +		banks[1] |= MCI_STATUS_OVER;
>>> +	return 0;
>>> +}
>>>   
>>>       
>> Can userspace just use KVM_SET_MSR for this?
>>     
>
> In addition to assigning the MSR, we have some other logic for MCE,
> such as per-bank reporting disabling, overwrite rules, and triple fault
> for a UC MCE during MCIP. So I think we need a dedicated interface.
>   

Yes, you're right.

>   
>>> +	case KVM_X86_SETUP_MCE: {
>>> +		u64 mcg_cap;
>>> +
>>> +		r = -EFAULT;
>>> +		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
>>> +			goto out;
>>> +		/*
>>> +		 * extended machine-check state registers and CMCI are
>>> +		 * not supported.
>>> +		 */
>>> +		mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
>>>   
>>>       
>> Instead of silently dropping, should return an error.
>>
>>     
>>> +		if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
>>> +			goto out;
>>>   
>>>       
>> And not copy.
>>     
>
> This is designed as a kind of feature negotiation. Userspace requests
> MCE features via mcg_cap; the kernel removes unsupported features and
> returns the resulting mcg_cap.
>   

kvm does feature negotiation (really, feature advertising) using 
KVM_CAP_... and KVM_CHECK_EXTENSION.  So there's no need for this.
Andi Kleen April 11, 2009, 12:19 p.m. UTC | #4
> Right, but we can allocate the maximum number, no?  it's a fairly small 
> amount of memory.

There are 255 banks in the worst case, which need up to 4 (in theory 5) MSRs each
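
(For scale: 255 banks × 4 MSRs × 8 bytes per MSR comes to 8160 bytes,
i.e. the roughly 8KB per vcpu discussed below.)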

-Andi
Avi Kivity April 11, 2009, 12:25 p.m. UTC | #5
Andi Kleen wrote:
>> Right, but we can allocate the maximum number, no?  it's a fairly small 
>> amount of memory.
>>     
>
> There are 255 banks in the worst case, which need up to 4 (in theory 5) MSRs each
>
>   

That's 8KB, which would normally not be allocated.  But I think we can 
live with it.

Do we actually need to support 255 banks?  Or can we support a smaller 
number transparently?

(IOW: are the banks interchangeable?)
Huang, Ying April 13, 2009, 2:41 a.m. UTC | #6
On Sat, 2009-04-11 at 20:04 +0800, Avi Kivity wrote:
> >> Why not always allocate it on vcpu setup?
> >>     
> >
> > Because the MCE bank number is not fixed, it is based on mcg_cap from
> > user space.
> >   
> 
> Right, but we can allocate the maximum number, no?  it's a fairly small 
> amount of memory.

OK. If you are OK with 8k extra memory usage.
   
> >   
> >>> +	case KVM_X86_SETUP_MCE: {
> >>> +		u64 mcg_cap;
> >>> +
> >>> +		r = -EFAULT;
> >>> +		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
> >>> +			goto out;
> >>> +		/*
> >>> +		 * extended machine-check state registers and CMCI are
> >>> +		 * not supported.
> >>> +		 */
> >>> +		mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
> >>>   
> >>>       
> >> Instead of silently dropping, should return an error.
> >>
> >>     
> >>> +		if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
> >>> +			goto out;
> >>>   
> >>>       
> >> And not copy.
> >>     
> >
> > This is designed as a kind of feature negotiation. Userspace requests
> > MCE features via mcg_cap; the kernel removes unsupported features and
> > returns the resulting mcg_cap.
> >   
> 
> kvm does feature negotiation (really, feature advertising) using 
> KVM_CAP_... and KVM_CHECK_EXTENSION.  So there's no need for this.

It is not only feature advertising, but also something like CPUID. Based
on the CPU type specified on the command line (qemu -cpu ?), a different
mcg_cap can be used to determine the number of MCE banks and the various
MCE features (MCG_CTL_P, MCG_TES_P, MCG_CMCI, etc.) for the specified
CPU type. With KVM feature advertising, we can only check whether MCE
simulation is supported, but cannot set the parameters of the MCE
simulation.

Maybe this interface could be changed to KVM_SET_MSRS, with
MSR_IA32_MCG_CAP set by kvm-qemu. But MSR_IA32_MCG_CAP is read-only per
the spec, and the current interface cannot distinguish between an MSR
write from the guest and an MSR setup from kvm-qemu. I think there are
other read-only MSRs that need to be set up during guest creation/setup
as well. How about adding an interface such as KVM_SETUP_MSRS to set the
value of read-only MSRs?

Best Regards,
Huang Ying
Andi Kleen April 13, 2009, 8:26 a.m. UTC | #7
On Sat, Apr 11, 2009 at 03:25:19PM +0300, Avi Kivity wrote:
> Andi Kleen wrote:
> >>Right, but we can allocate the maximum number, no?  it's a fairly small 
> >>amount of memory.
> >>    
> >
> >There are 255 banks in the worst case, which need up to 4 (in theory 5) MSRs each
> >
> >  
> 
> That's 8KB, which would normally not be allocated.  But I think we can 
> live with it.
> 
> Do we actually need to support 255 banks? 

Probably not. Current CPUs all have a single-digit number of banks.

E.g. 64 would be plenty for now.

> Or can we support a smaller 
> number transparently?

Not transparently. In theory software can hardcode that for cpu model X
bank Y is available and does Z. Software doesn't have to; Intel provides
a bank-independent way to report errors too, but we don't know if
everyone uses it. Linux doesn't rely on bank numbers on Intel, at least.

-Andi
Avi Kivity April 13, 2009, 1:02 p.m. UTC | #8
Huang Ying wrote:
> On Sat, 2009-04-11 at 20:04 +0800, Avi Kivity wrote:
>   
>>>> Why not always allocate it on vcpu setup?
>>>>     
>>>>         
>>> Because the MCE bank number is not fixed, it is based on mcg_cap from
>>> user space.
>>>   
>>>       
>> Right, but we can allocate the maximum number, no?  it's a fairly small 
>> amount of memory.
>>     
>
> OK. If you are OK with 8k extra memory usage.
>   

Let's make it 64 banks like Andi suggests.  Return the number in the 
KVM_CHECK_EXTENSION call (we do the same for memory slots) so userspace 
can adjust.
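
A minimal sketch of what that userspace check could look like; the
KVM_CAP_MCE name and the bank count being encoded in the
KVM_CHECK_EXTENSION return value are assumptions at this point in the
thread:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	/* Hypothetical capability: a return value > 0 would be the
	 * number of MCE banks the kernel can emulate. */
	int banks = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MCE);

	if (banks > 0)
		printf("MCE emulation: up to %d banks\n", banks);
	else
		printf("MCE emulation not supported\n");
	return 0;
}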

>>
>> kvm does feature negotiation (really, feature advertising) using 
>> KVM_CAP_... and KVM_CHECK_EXTENSION.  So there's no need for this.
>>     
>
> It is not only feature advertising, but also something like CPUID. Based
> on the CPU type specified on the command line (qemu -cpu ?), a different
> mcg_cap can be used to determine the number of MCE banks and the various
> MCE features (MCG_CTL_P, MCG_TES_P, MCG_CMCI, etc.) for the specified
> CPU type. With KVM feature advertising, we can only check whether MCE
> simulation is supported, but cannot set the parameters of the MCE
> simulation.
>
> Maybe this interface could be changed to KVM_SET_MSRS, with
> MSR_IA32_MCG_CAP set by kvm-qemu. But MSR_IA32_MCG_CAP is read-only per
> the spec, and the current interface cannot distinguish between an MSR
> write from the guest and an MSR setup from kvm-qemu. I think there are
> other read-only MSRs that need to be set up during guest creation/setup
> as well. How about adding an interface such as KVM_SETUP_MSRS to set the
> value of read-only MSRs?
>   

Then we would need to tell which read-only MSRs are setup writeable and 
which aren't...

I'm okay with an ioctl to setup MCE, but just make sure userspace has 
all the information to know what the kernel can do rather than the 
try-and-see-if-it-works approach.  We can publish this information via 
KVM_CAP things, or via another ioctl (see KVM_GET_SUPPORTED_CPUID2 for 
an example).
Huang, Ying April 14, 2009, 2:04 a.m. UTC | #9
On Mon, 2009-04-13 at 21:02 +0800, Avi Kivity wrote:
> Huang Ying wrote:
> > On Sat, 2009-04-11 at 20:04 +0800, Avi Kivity wrote:
> >   
> >>>> Why not always allocate it on vcpu setup?
> >>>>     
> >>>>         
> >>> Because the MCE bank number is not fixed, it is based on mcg_cap from
> >>> user space.
> >>>   
> >>>       
> >> Right, but we can allocate the maximum number, no?  it's a fairly small 
> >> amount of memory.
> >>     
> >
> > OK. If you are OK with 8k extra memory usage.
> >   
> 
> Let's make it 64 banks like Andi suggests.  Return the number in the 
> KVM_CHECK_EXTENSION call (we do the same for memory slots) so userspace 
> can adjust.

OK. I will change this.

> >>
> >> kvm does feature negotiation (really, feature advertising) using 
> >> KVM_CAP_... and KVM_CHECK_EXTENSION.  So there's no need for this.
> >>     
> >
> > It is not only feature advertising, but also something like CPUID. Based
> > on the CPU type specified on the command line (qemu -cpu ?), a different
> > mcg_cap can be used to determine the number of MCE banks and the various
> > MCE features (MCG_CTL_P, MCG_TES_P, MCG_CMCI, etc.) for the specified
> > CPU type. With KVM feature advertising, we can only check whether MCE
> > simulation is supported, but cannot set the parameters of the MCE
> > simulation.
> >
> > Maybe this interface could be changed to KVM_SET_MSRS, with
> > MSR_IA32_MCG_CAP set by kvm-qemu. But MSR_IA32_MCG_CAP is read-only per
> > the spec, and the current interface cannot distinguish between an MSR
> > write from the guest and an MSR setup from kvm-qemu. I think there are
> > other read-only MSRs that need to be set up during guest creation/setup
> > as well. How about adding an interface such as KVM_SETUP_MSRS to set the
> > value of read-only MSRs?
> >   
> 
> Then we would need to tell which read-only MSRs are setup writeable and 
> which aren't...

Maybe something like: only if KVM_CAP_MCE is advertised can the
MCE-related read-only MSRs be set up. Setup code for other read-only
MSRs can be added following similar rules.

And interface is something like follow?

long kvm_arch_vcpu_ioctl(...)
{
    ...
    case KVM_SETUP_RO_MSRS:
        r = msr_io(vcpu, argp, kvm_setup_ro_msr, 0);
    ...
}

int kvm_setup_ro_msr(...)
{
    /* return 1 for an MSR that is not writable via setup, otherwise 0 */
}

> I'm okay with an ioctl to setup MCE, but just make sure userspace has 
> all the information to know what the kernel can do rather than the 
> try-and-see-if-it-works approach.  We can publish this information via 
> KVM_CAP things, or via another ioctl (see KVM_GET_SUPPORTED_CPUID2 for 
> an example).

Yes. MCE support should be published by KVM_CAP_MCE and other features
can be published via reading the default value of MSR_IA32_MCG_CAP.

Best Regards,
Huang Ying
Avi Kivity April 14, 2009, 10:45 a.m. UTC | #10
Huang Ying wrote:
>> I'm okay with an ioctl to setup MCE, but just make sure userspace has 
>> all the information to know what the kernel can do rather than the 
>> try-and-see-if-it-works approach.  We can publish this information via 
>> KVM_CAP things, or via another ioctl (see KVM_GET_SUPPORTED_CPUID2 for 
>> an example).
>>     
>
> Yes. MCE support should be published by KVM_CAP_MCE and other features
> can be published via reading the default value of MSR_IA32_MCG_CAP.
>   

A problem with this is that you can only read an MSR after a vcpu has 
been created.  But if you're writing a program to detect what features 
are available (for example, when checking features common to a migration 
pool), you don't want to create a vcpu (you could, but it's hacky).
Huang, Ying April 15, 2009, 7:24 a.m. UTC | #11
On Tue, 2009-04-14 at 18:45 +0800, Avi Kivity wrote:
> Huang Ying wrote:
> >> I'm okay with an ioctl to setup MCE, but just make sure userspace has 
> >> all the information to know what the kernel can do rather than the 
> >> try-and-see-if-it-works approach.  We can publish this information via 
> >> KVM_CAP things, or via another ioctl (see KVM_GET_SUPPORTED_CPUID2 for 
> >> an example).
> >>     
> >
> > Yes. MCE support should be published by KVM_CAP_MCE and other features
> > can be published via reading the default value of MSR_IA32_MCG_CAP.
> >   
> 
> A problem with this is that you can only read an MSR after a vcpu has 
> been created.  But if you're writing a program to detect what features 
> are available (for example, when checking features common to a migration 
> pool), you don't want to create a vcpu (you could, but it's hacky).

Yes. You are right. I will change this as you said, something like
KVM_GET_SUPPORTED_CPUID2.

Best Regards,
Huang Ying
Anthony Liguori April 18, 2009, 10:17 p.m. UTC | #12
Avi Kivity wrote: 
>
> Then we would need to tell which read-only MSRs are setup writeable 
> and which aren't...
>
> I'm okay with an ioctl to setup MCE, but just make sure userspace has 
> all the information to know what the kernel can do rather than the 
> try-and-see-if-it-works approach.  We can publish this information via 
> KVM_CAP things, or via another ioctl (see KVM_GET_SUPPORTED_CPUID2 for 
> an example).

Why not introduce a new exit type for MSR reads/writes that aren't 
handled by the kernel?  You just need a bit on the return that indicates 
whether to GPF because of an invalid MSR access.

KVM_SET_MSRs should be reserved for MSRs that are performance 
sensitive.  Not all of them will be.
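
As a sketch, such an exit payload might look like the following; the
names and layout are hypothetical, not an interface that exists at this
point in the thread:

#include <linux/types.h>

/* Hypothetical kvm_run member for MSR accesses punted to userspace. */
struct kvm_msr_exit {
	__u8  write;      /* 0 = rdmsr, 1 = wrmsr */
	__u8  inject_gp;  /* set by userspace to fault the access with #GP */
	__u8  pad[2];
	__u32 index;      /* MSR number */
	__u64 data;       /* wrmsr data in; rdmsr result out */
};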

Regards,

Anthony Liguori

Avi Kivity April 19, 2009, 8:33 a.m. UTC | #13
Anthony Liguori wrote:
> Avi Kivity wrote:
>>
>> Then we would need to tell which read-only MSRs are setup writeable 
>> and which aren't...
>>
>> I'm okay with an ioctl to setup MCE, but just make sure userspace has 
>> all the information to know what the kernel can do rather than the 
>> try-and-see-if-it-works approach.  We can publish this information 
>> via KVM_CAP things, or via another ioctl (see 
>> KVM_GET_SUPPORTED_CPUID2 for an example).
>
> Why not introduce a new exit type for MSR reads/writes that aren't 
> handled by the kernel?  You just need a bit on the return that 
> indicates whether to GPF because of an invalid MSR access.
>
> KVM_SET_MSRs should be reserved for MSRs that are performance 
> sensitive.  Not all of them will be.
>

Right now everything in the vcpu is emulated in the kernel.  Everything 
else is emulated either in the kernel (irqchip) or in userspace.  This 
makes things easier to understand, and is more future friendly if more 
cpu features become virtualized by hardware.

While these are not compelling reasons, they at least lean the balance 
in favour of a kernel implementation.
Gerd Hoffmann April 20, 2009, 7:52 a.m. UTC | #14
On 04/19/09 10:33, Avi Kivity wrote:
> Anthony Liguori wrote:
>> Why not introduce a new exit type for MSR reads/writes that aren't
>> handled by the kernel? You just need a bit on the return that
>> indicates whether to GPF because of an invalid MSR access.

Yes, please.

> Right now everything in the vcpu is emulated in the kernel. Everything
> else is emulated either in the kernel (irqchip) or in userspace. This
> makes things easier to understand, and is more future friendly if more
> cpu features become virtualized by hardware.
>
> While these are not compelling reasons, they at least lean the balance
> in favour of a kernel implementation.

The xen pv-on-hvm drivers use an msr to indicate "please place the 
hypercall page here".  Handling that in kernel isn't an option IMHO.

cheers,
   Gerd
Avi Kivity April 20, 2009, 8:26 a.m. UTC | #15
Gerd Hoffmann wrote:
>> Right now everything in the vcpu is emulated in the kernel. Everything
>> else is emulated either in the kernel (irqchip) or in userspace. This
>> makes things easier to understand, and is more future friendly if more
>> cpu features become virtualized by hardware.
>>
>> While these are not compelling reasons, they at least lean the balance
>> in favour of a kernel implementation.
>
> The xen pv-on-hvm drivers use an msr to indicate "please place the 
> hypercall page here".  Handling that in kernel isn't an option IMHO.

The contents of the hypercall page are vendor specific.  This can be 
handled from userspace (though ideally we'd abstract the cpu vendor 
away).  The Hyper-V hypercall page is more problematic, as it's 
specified to be an overlay; the page doesn't have to exist in guest RAM.
Gerd Hoffmann April 20, 2009, 8:59 a.m. UTC | #16
On 04/20/09 10:26, Avi Kivity wrote:
> Gerd Hoffmann wrote:
>> The xen pv-on-hvm drivers use an msr to indicate "please place the
>> hypercall page here". Handling that in kernel isn't an option IMHO.
>
> The contents of the hypercall page are vendor specific. This can be
> handled from userspace (though ideally we'd abstract the cpu vendor
> away).

Well, xenner doesn't do vmcalls, so the page isn't vendor specific.  It 
looks different for 32bit / 64bit guests though.  And it actually can be 
multiple pages (with one msr write per page).  So the interface for 
in-kernel handling would be more complex than "here is a hypercall page 
for you".

 > The Hyper-V hypercall page is more problematic, as it's specified to 
 > be an overlay; the page doesn't have to exist in guest RAM.

In userspace it should be easy to handle though as qemu can just create 
a new memory slot, right?

cheers,
   Gerd
Avi Kivity April 20, 2009, 9:05 a.m. UTC | #17
Gerd Hoffmann wrote:
> On 04/20/09 10:26, Avi Kivity wrote:
>> Gerd Hoffmann wrote:
>>> The xen pv-on-hvm drivers use an msr to indicate "please place the
>>> hypercall page here". Handling that in kernel isn't an option IMHO.
>>
>> The contents of the hypercall page are vendor specific. This can be
>> handled from userspace (though ideally we'd abstract the cpu vendor
>> away).
>
> Well, xenner doesn't do vmcalls, so the page isn't vendor specific.  

Well, for true pv (not pv-on-hvm) it wouldn't use the MSR, would it?

> It looks different for 32bit / 64bit guests though.  And it actually 
> can be multiple pages (with one msr write per page).  So the interface 
> for in-kernel handling would be more complex than "here is a hypercall 
> page for you".

To different MSRs, or multiple writes to the same MSR?

>
> > The Hyper-V hypercall page is more problematic, as it's specified to 
> > be an overlay; the page doesn't have to exist in guest RAM.
>
> In userspace it should be easy to handle though as qemu can just 
> create a new memory slot, right?

It depends if the MSR may be considered global, or is required to be 
per-cpu.  Need to refresh my memory on this, but I remember reading this 
and saying 'yuck'.
Andi Kleen April 20, 2009, 10:04 a.m. UTC | #18
> It depends if the MSR may be considered global, or is required to be 
> per-cpu.  Need to refresh my memory on this, but I remember reading this 

Machine Check MSRs need to be per CPU. The latest kernels have some support
for "shared banks" (between CPUs), but it's better not to have it, and
older OSes don't like it.

-Andi
Avi Kivity April 20, 2009, 11:02 a.m. UTC | #19
Andi Kleen wrote:
>> It depends if the MSR may be considered global, or is required to be 
>> per-cpu.  Need to refresh my memory on this, but I remember reading this 
>>     
>
> Machine Check MSRs need to be per CPU. The latest kernels have some support
> for "shared banks" (between CPUs), but it's better not to have it, and
> older OSes don't like it.
>   

In the case of the hypercall page MSR, it is global and shared among all 
processors.
Gerd Hoffmann April 20, 2009, 11:23 a.m. UTC | #20
On 04/20/09 11:05, Avi Kivity wrote:
> Gerd Hoffmann wrote:
>> On 04/20/09 10:26, Avi Kivity wrote:
>>> Gerd Hoffmann wrote:
>>>> The xen pv-on-hvm drivers use an msr to indicate "please place the
>>>> hypercall page here". Handling that in kernel isn't an option IMHO.
>>>
>>> The contents of the hypercall page are vendor specific. This can be
>>> handled from userspace (though ideally we'd abstract the cpu vendor
>>> away).
>>
>> Well, xenner doesn't do vmcalls, so the page isn't vendor specific.
>
> Well, for true pv (not pv-on-hvm) it wouldn't use the MSR, would it?

Yes, the MSR is used for pv-on-hvm only.

>> It looks different for 32bit / 64bit guests though. And it actually
>> can be multiple pages (with one msr write per page). So the interface
>> for in-kernel handling would be more complex than "here is a hypercall
>> page for you".
>
> To different MSRs, or multiple writes to the same MSR?

Same MSR, multiple writes (page number in the low bits).

cheers,
   Gerd

Avi Kivity April 20, 2009, 11:27 a.m. UTC | #21
Gerd Hoffmann wrote:
>>> Well, xenner doesn't do vmcalls, so the page isn't vendor specific.
>>
>> Well, for true pv (not pv-on-hvm) it wouldn't use the MSR, would it?
>
> Yes, the MSR is used for pv-on-hvm only.

So it isn't relevant for Xenner?

That said, I'd like to be able to emulate the Xen HVM hypercalls.  But 
in any case, the hypercall implementation has to be in the kernel, so I 
don't see why the MSR shouldn't be.  Especially if we need to support 
tricky bits like continuations.

> Same MSR, multiple writes (page number in the low bits).

Nasty.  The hypervisor has to remember all of the pages, so it can 
update them for live migration.  And it has to forget about them on 
reset.  And something needs to be done for kexec...
Gerd Hoffmann April 20, 2009, 12:20 p.m. UTC | #22
On 04/20/09 13:27, Avi Kivity wrote:
> Gerd Hoffmann wrote:
>>>> Well, xenner doesn't do vmcalls, so the page isn't vendor specific.
>>>
>>> Well, for true pv (not pv-on-hvm) it wouldn't use the MSR, would it?
>>
>> Yes, the MSR is used for pv-on-hvm only.
>
> So it isn't relevant for Xenner?

It is.  I still plan to merge xenner into qemu, and also support 
xen-style pv-on-hvm drivers.

> That said, I'd like to be able to emulate the Xen HVM hypercalls. But in
> any case, the hypercall implementation has to be in the kernel,

No.  With Xenner the xen hypercall emulation code lives in guest address 
space.

> so I
> don't see why the MSR shouldn't be.

I don't care that much, but /me thinks it would be easier to handle in 
userspace ...

> Especially if we need to support
> tricky bits like continuations.

Is there any reason to?  I *think* xen does it for better scheduling 
latency.  But with xen emulation sitting in guest address space we can 
schedule the guest at will anyway.

>> Same MSR, multiple writes (page number in the low bits).
>
> Nasty. The hypervisor has to remember all of the pages, so it can update
> them for live migration.

Xenner doesn't need update-on-migration, so there is no need at all to 
remember this.  At the end of the day it is just memcpy(guest, data, 
PAGESIZE) triggered by wrmsr.

cheers,
   Gerd
Avi Kivity April 20, 2009, 12:43 p.m. UTC | #23
Gerd Hoffmann wrote:
>> That said, I'd like to be able to emulate the Xen HVM hypercalls. But in
>> any case, the hypercall implementation has to be in the kernel,
>
> No.  With Xenner the xen hypercall emulation code lives in guest 
> address space.

In this case the guest ring-0 code should trap the #GP, and install the 
hypercall page (which uses sysenter/syscall?).  No kvm or qemu changes 
needed.

>> Especially if we need to support
>> tricky bits like continuations.
>
> Is there any reason to?  I *think* xen does it for better scheduling 
> latency.  But with xen emulation sitting in guest address space we can 
> schedule the guest at will anyway.

It also improves latency within the guest itself.  At least I think 
that's what the Hyper-V spec is saying.  You can interrupt the execution 
of a long hypercall, inject an interrupt, and resume.  Sort of like a 
rep/movs instruction, which the cpu can and will interrupt.

>>> Same MSR, multiple writes (page number in the low bits).
>>
>> Nasty. The hypervisor has to remember all of the pages, so it can update
>> them for live migration.
>
> Xenner doesn't need update-on-migration, so there is no need at all to 
> remember this.  At the end of the day it is just memcpy(guest, data, 
> PAGESIZE) triggered by wrmsr. 

For Xenner, no (and you don't need to intercept the msr at all), but for 
pv-on-hvm, you do need to update the code.
Gerd Hoffmann April 20, 2009, 1:24 p.m. UTC | #24
On 04/20/09 14:43, Avi Kivity wrote:
> Gerd Hoffmann wrote:
>>> That said, I'd like to be able to emulate the Xen HVM hypercalls. But in
>>> any case, the hypercall implementation has to be in the kernel,
>>
>> No. With Xenner the xen hypercall emulation code lives in guest
>> address space.
>
> In this case the guest ring-0 code should trap the #GP, and install the
> hypercall page (which uses sysenter/syscall?). No kvm or qemu changes
> needed.

Doesn't fly.

Reason #1: In the pv-on-hvm case the guest runs on ring0.
Reason #2: Chicken-and-egg issue:  For the pv-on-hvm case only a few,
            simple hypercalls are needed.  The code to handle them
            is small enough that it can be loaded directly into the
            hypercall page(s).

pure-pv doesn't need it in the first place.  But, yes, there I could 
simply trap #GP because the guest kernel runs on ring #1 (or #3 on 64bit).

>>> Especially if we need to support
>>> tricky bits like continuations.
>>
>> Is there any reason to? I *think* xen does it for better scheduling
>> latency. But with xen emulation sitting in guest address space we can
>> schedule the guest at will anyway.
>
> It also improves latency within the guest itself. At least I think
> that's what the Hyper-V spec is saying. You can interrupt the execution
> of a long hypercall, inject an interrupt, and resume. Sort of like a
> rep/movs instruction, which the cpu can and will interrupt.

Hmm.  Needs investigation...  I'd expect the main source of latencies is 
page table walking.  Xen works very differently from kvm+xenner here ...

> For Xenner, no (and you don't need to intercept the msr at all),  but for
> pv-on-hvm, you do need to update the code.

Xenner handling pv-on-hvm doesn't need code updates either.  Real Xen 
does, as it uses vmcall; not sure how they handle migration.

cheers
   Gerd
Avi Kivity April 20, 2009, 1:45 p.m. UTC | #25
Gerd Hoffmann wrote:
> On 04/20/09 14:43, Avi Kivity wrote:
>> Gerd Hoffmann wrote:
>>>> That said, I'd like to be able to emulate the Xen HVM hypercalls. 
>>>> But in
>>>> any case, the hypercall implementation has to be in the kernel,
>>>
>>> No. With Xenner the xen hypercall emulation code lives in guest
>>> address space.
>>
>> In this case the guest ring-0 code should trap the #GP, and install the
>> hypercall page (which uses sysenter/syscall?). No kvm or qemu changes
>> needed.
>
> Doesn't fly.
>
> Reason #1: In the pv-on-hvm case the guest runs on ring0.

Sure, in this case you need to trap the MSR in the kernel (or qemu).  
But the handler is no longer in the guest address space, and you do need 
to update the opcode.

Let's not confuse the two cases.

> Reason #2: Chicken-and-egg issue:  For the pv-on-hvm case only a few,
>            simple hypercalls are needed.  The code to handle them
>            is small enough that it can be loaded directly into the
>            hypercall page(s).

Please elaborate.  What hypercalls are so simple that an exit into the 
hypervisor is not necessary?

>>> Is there any reason to? I *think* xen does it for better scheduling
>>> latency. But with xen emulation sitting in guest address space we can
>>> schedule the guest at will anyway.
>>
>> It also improves latency within the guest itself. At least I think
>> that's what the Hyper-V spec is saying. You can interrupt the execution
>> of a long hypercall, inject an interrupt, and resume. Sort of like a
>> rep/movs instruction, which the cpu can and will interrupt.
>
> Hmm.  Needs investigation...  I'd expect the main source of latencies 
> is page table walking.  Xen works very differently from kvm+xenner here ...

kvm is mostly O(1).  We need to limit rmap chains, but we're fairly 
close.  The kvm paravirt mmu calls are not O(1), but we can easily use 
continuations there (and they're disabled on newer processors anyway).

Another area that worries me is virtio notification, which can take a 
long time.  It won't be trivial, but we can make it work:

- for the existing pio-to-userspace notification, add a bit that tells 
the kernel to repeat the instruction instead of continuing.  The 'outl' 
instruction is idempotent, so we can do partial work and return to the 
kernel.
- if using hypercallfd/piofd to a pipe, we're offloading everything to 
another thread anyway, so we can return immediately
- if using hypercallfd/piofd to a kernel virtio server, it can return 0 
bytes written, indicating it needs a retry.  kvm can try to inject an 
interrupt if it sees this.
Gerd Hoffmann April 21, 2009, 9:14 a.m. UTC | #26
On 04/20/09 15:45, Avi Kivity wrote:

> Please elaborate. What hypercalls are so simple that an exit into the
> hypervisor is not necessary?

Ok, that becomes a longer story.  I'll try to keep it short though ...


xenner today (pure-pv only)
===========================

There is the xenner userspace application.  Handles start-of-day 
creation and the guest <=> host communication (well, not all of it, but 
these details are not relevant here).

There is emu.  Lives in guest address space, in the xen hypervisor 
address space hole.  Kind of a micro-kernel.  Handles all the hypercalls. 
Most stuff it can do internally, without leaving guest context.  In a 
few cases it has to ask the xenner application for help.  That is the 
case for guest <-> host communication things, event channel setup for 
example.

xenner and emu talk to each other using an ioport based interface.


xenner future plans
===================

I want merge the userspace bits into qemu, so qemu can emulate xen 
guests (both tcg and kvm mode).

xenner application goes away.
emu will stay the same.
Likewise the ioport interface for emu.


xenner & pv-on-hvm
==================

Once we have all this in qemu it is just a small step to also support 
xenish pv-on-hvm drivers in qemu using the xenner emulation bits. 
Hypercalls are handled by a small PIC (position-independent) binary 
loaded into the hypercall pages.  Loading of the binary is triggered by 
the msr writes discussed.  The binary is only two pages in size: one for 
the hypercall entry points, one for code.  The communication path is the 
very same ioport interface also used by emu, i.e. it does *not* use 
vmcall and thus no opcode changes are needed on migration.

Hope the whole picture is more clear now ...

cheers,
   Gerd

PS: bitrotted (and IIRC also broken) code is here:
http://git.et.redhat.com/?p=qemu-kraxel.git;a=shortlog;h=refs/heads/xenner-old

Needs un-rotting once the first batch of xen patches is merged.
Avi Kivity April 21, 2009, 10:14 a.m. UTC | #27
Gerd Hoffmann wrote:
> On 04/20/09 15:45, Avi Kivity wrote:
>
>> Please elaborate. What hypercalls are so simple that an exit into the
>> hypervisor is not necessary?
>
> Ok, that becomes a longer story.  I'll try to keep it short though ...
>

Thanks, this is very helpful.

>
> xenner & pv-on-hvm
> ==================
>
> Once we have all this in qemu it is just a small step to also support 
> xenish pv-on-hvm drivers in qemu using the xenner emulation bits. 
> Hypercalls are handled by a small PIC (position-independent) binary 
> loaded into the hypercall pages.  Loading of the binary is triggered 
> by the msr writes discussed.  The binary is only two pages in size: 
> one for the hypercall entry points, one for code.  The communication 
> path is the very same ioport interface also used by emu, i.e. it does 
> *not* use vmcall and thus no opcode changes are needed on migration.
>
This gives a good case for exporting MSRs to userspace.

Can you explain the protocol used by this MSR?  How does the guest know 
how many pages to load?  How does the kernel know which type of page to 
put where?

Note that it would still be interesting to have the guest call the 
kernel, so it can kick the host kernel Xen netback driver directly 
instead of going through qemu (and the userspace netback + tap).
Gerd Hoffmann April 21, 2009, 11:41 a.m. UTC | #28
On 04/21/09 12:14, Avi Kivity wrote:
> Gerd Hoffmann wrote:
>> xenner & pv-on-hvm
>> ==================
>>
>> Once we have all this in qemu it is just a small step to also support
>> xenish pv-on-hvm drivers in qemu using the xenner emulation bits.
>> Hypercalls are handled by a small pic binary loaded into the hypercall
>> pages. Loading of the binary is triggered by the msr writes discussed.
>> Size of the binary is only two pages: one hypercall entry points, one
>> code. Communication path is the very same ioport interface also used
>> by emu, i.e. it does *not* use vmcall and thus no opcode changes are
>> needed on migration.
>>
> This gives a good case for exporting MSRs to userspace.
>
> Can you explain the protocol used by this MSR? How does the guest know
> how many pages to load? How does the kernel know which type of page to
> put where?

Sure.

   (1) cpuid 0x40000000, check vmm signature
   (2) cpuid 0x40000002 -> returns # of pages (eax) and msr (ebx)
   (3) allocate pages (normal ram)
   (4) foreach page (wrmsr "guest physical address | pageno")

Xen uses msr 0x40000000.  Due to the msr being queried via cpuid it 
should be possible to use another one.  Modulo guest bugs of course ...
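
For concreteness, a rough guest-side sketch of those four steps
(guest-kernel context; the signature check is elided and
hypercall_page_gpa() is a hypothetical helper returning the physical
address of the i-th preallocated page):

#include <stdint.h>

extern uint64_t hypercall_page_gpa(unsigned int i);	/* hypothetical */

static inline void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
			 uint32_t *c, uint32_t *d)
{
	asm volatile("cpuid" : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
		     : "a"(leaf));
}

static inline void wrmsr(uint32_t msr, uint64_t val)
{
	asm volatile("wrmsr" : : "c"(msr), "a"((uint32_t)val),
		     "d"((uint32_t)(val >> 32)));
}

void register_hypercall_pages(void)
{
	uint32_t eax, ebx, ecx, edx, i;

	cpuid(0x40000000, &eax, &ebx, &ecx, &edx); /* (1) vmm signature (check elided) */
	cpuid(0x40000002, &eax, &ebx, &ecx, &edx); /* (2) eax = #pages, ebx = msr */
	for (i = 0; i < eax; i++)                  /* (3)+(4) register each page */
		wrmsr(ebx, hypercall_page_gpa(i) | i); /* page number in the low bits */
}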

> Note that it would still be interesting to have the guest call the
> kernel, so it can kick the host kernel Xen netback driver directly
> instead of going through qemu (and the userspace netback + tap).

With the current codebase netback isn't involved at all.  The backend 
lives in qemu like virtio-net.  It is a very simple one (no GSO support, 
...); there are tons of opportunities for optimizations.  I don't feel 
like tackling that right now though.  My patch queue is already deep 
enough as-is.  Also, as time passes the qemu network layer tweaks for 
virtio-net should make that job easier ;)

cheers,
   Gerd

Avi Kivity April 21, 2009, 12:27 p.m. UTC | #29
Gerd Hoffmann wrote:
>> Can you explain the protocol used by this MSR? How does the guest know
>> how many pages to load? How does the kernel know which type of page to
>> put where?
>
> Sure.
>
>   (1) cpuid 0x40000000, check vmm signature
>   (2) cpuid 0x40000002 -> returns # of pages (eax) and msr (ebx)
>   (3) allocate pages (normal ram)
>   (4) foreach page (wrmsr "guest physical address | pageno")
>
> Xen uses msr 0x40000000.  Due to the msr being queried via cpuid it 
> should be possible to use another one.  Modulo guest bugs of course ...

Is there an interface to forget a page? (if not, how do you unload a 
driver? presumably Xen remembers the page address so it can patch it)

If you load each driver, do you run this multiple times, or does the 
first driver run this once and the others reuse the same pages?

>
>> Note that it would still be interesting to have the guest call the
>> kernel, so it can kick the host kernel Xen netback driver directly
>> instead of going through qemu (and the userspace netback + tap).
>
> With the current codebase netback isn't involved at all.  The backend 
> lives in qemu like virtio-net.  It is a very simple one (no GSO 
> support, ...); there are tons of opportunities for optimizations.  
> I don't feel like tackling that right now though.  My patch queue is 
> already deep enough as-is.  Also, as time passes the qemu network 
> layer tweaks for virtio-net should make that job easier ;)

Sure, but later on, we may want to take advantage of kernel netback.  Of 
course we'll want to keep qemu netback as well, since it's much simpler 
and can be used unprivileged.
Gerd Hoffmann April 21, 2009, 12:47 p.m. UTC | #30
On 04/21/09 14:27, Avi Kivity wrote:
> Gerd Hoffmann wrote:
>> (1) cpuid 0x40000000, check vmm signature
>> (2) cpuid 0x40000002 -> returns # of pages (eax) and msr (ebx)
>> (3) allocate pages (normal ram)
>> (4) foreach page (wrmsr "guest physical address | pageno")
>>
>> Xen uses msr 0x40000000. Due to the msr being queried via cpuid it
>> should be possible to use another one. Modulo guest bugs of course ...
>
> Is there an interface to forget a page? (if not, how do you unload a
> driver? presumably Xen remembers the page address so it can patch it)

Not sure, have to dig into the xen code to figure.

Could be xen doesn't remember the page in the first place.  They might 
let the illegal instruction fault handler patch the opcode.  At least I 
vaguely remember some discussions about that.

Could be there isn't an interface to forget the page.  "reboot" in xen 
land is "destroy guest, restart it".

> If you load each driver, do you run this multiple times, or does the
> first driver run this once and the others reuse the same pages?

There is one driver (xen-platform-pci) which does this once at load 
time, then provides xenish services (event channels, ...) to the actual 
frontend drivers.  So for the actual frontend drivers there isn't a big 
difference between pure-pv and pv-on-hvm.

> Sure, but later on, we may want to take advantage of kernel netback.

Agree.  But right now I have more important stuff to worry about ;)

cheers,
   Gerd

Avi Kivity April 21, 2009, 1:33 p.m. UTC | #31
Gerd Hoffmann wrote:
> Not sure, have to dig into the xen code to figure.
>
> Could be xen doesn't remember the page in the first place.  They might 
> let the illegal instruction fault handler patch the opcode.  At least 
> I vaguely remember some discussions about that.
>
> Could be there isn't an interface to forget the page.  "reboot" in xen 
> land is "destroy guest, restart it".

How do you unload a driver then?

What about kexec?

I'm glad we didn't do a hypercall page in kvm.

>> Sure, but later on, we may want to take advantage of kernel netback.
>
> Agree.  But right now I have more important stuff to worry about ;)

Still good to know what to expect later on.
Gerd Hoffmann April 21, 2009, 4 p.m. UTC | #32
On 04/21/09 15:33, Avi Kivity wrote:
> Gerd Hoffmann wrote:
>> Not sure, have to dig into the xen code to figure it out.
>>
>> Could be xen doesn't remember the page in the first place. They might
>> let the illegal instruction fault handler patch the opcode. At least I
>> vaguely remember some discussions about that.

Xen does a simple msr-triggered memcpy() as well, without keeping track 
of the page.  At a quick glance I can't find an opcode patching place 
either.  Hmm.

>> Could be there isn't an interface to forget the page. "reboot" in xen
>> land is "destroy guest, restart it".
>
> How do you unload a driver then?

The linux module (xen-platform-pci) has no module_exit() ...

cheers,
   Gerd
Anthony Liguori April 21, 2009, 4:04 p.m. UTC | #33
Avi Kivity wrote:
> Anthony Liguori wrote:
>> Avi Kivity wrote:
>>>
>>> Then we would need to tell which read-only MSRs are setup writeable 
>>> and which aren't...
>>>
>>> I'm okay with an ioctl to setup MCE, but just make sure userspace 
>>> has all the information to know what the kernel can do rather than 
>>> the try-and-see-if-it-works approach.  We can publish this 
>>> information via KVM_CAP things, or via another ioctl (see 
>>> KVM_GET_SUPPORTED_CPUID2 for an example).
>>
>> Why not introduce a new exit type for MSR reads/writes that aren't 
>> handled by the kernel?  You just need a bit on the return that 
>> indicates whether to GPF because of an invalid MSR access.
>>
>> KVM_SET_MSRs should be reserved for MSRs that are performance 
>> sensitive.  Not all of them will be.
>>
>
> Right now everything in the vcpu is emulated in the kernel.  
> Everything else is emulated either in the kernel (irqchip) or in 
> userspace.  This makes things easier to understand, and is more future 
> friendly if more cpu features become virtualized by hardware.

Except cpuid, which is handled in userspace, sort of :-)  I think the 
same arguments apply here too.

Regards,

Anthony Liguori
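
As a concrete example of the "publish via KVM_CAP" route quoted above, a
userspace probe could look like the sketch below.  KVM_CHECK_EXTENSION is the
existing capability-query ioctl; KVM_CAP_MCE and its number are hypothetical
here.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#ifndef KVM_CAP_MCE
#define KVM_CAP_MCE 31		/* hypothetical capability number */
#endif

int kvm_has_mce(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int ret;

	if (kvm < 0)
		return 0;
	/* ask the kernel up front instead of try-and-see-if-it-works */
	ret = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
	close(kvm);
	return ret > 0;
}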
Avi Kivity April 21, 2009, 4:15 p.m. UTC | #34
Gerd Hoffmann wrote:
> On 04/21/09 15:33, Avi Kivity wrote:
>> Gerd Hoffmann wrote:
>>> Not sure, have to dig into the xen code to figure it out.
>>>
>>> Could be xen doesn't remember the page in the first place. They might
>>> let the illegal instruction fault handler patch the opcode. At least I
>>> vaguely remember some discussions about that.
>
> Xen does a simple msr-triggered memcpy() as well, without keeping 
> track of the page.  At a quick glance I can't find an opcode patching 
> place either.  Hmm.
>

Maybe migration is cooperative, that is, the host tells the guest it is 
migrated, so it can re-register the hypercall area?
Avi Kivity April 21, 2009, 4:17 p.m. UTC | #35
Anthony Liguori wrote:
>>
>> Right now everything in the vcpu is emulated in the kernel.  
>> Everything else is emulated either in the kernel (irqchip) or in 
>> userspace.  This makes things easier to understand, and is more 
>> future friendly if more cpu features become virtualized by hardware.
>
> Except cpuid, which is handled in userspace, sort of :-)  I think the 
> same arguments apply here too.

No, cpuid is handled in the kernel (precisely to keep everything cpuish in 
the kernel).  It is *configured* from userspace, but then so is everything.
Gerd Hoffmann April 22, 2009, 10:02 a.m. UTC | #36
On 04/21/09 18:15, Avi Kivity wrote:

> Maybe migration is cooperative,

It is.  It has to be; for the xen pv frontend drivers, migration isn't 
transparent.

> that is, the host tells the guest it is
> migrated, so it can re-register the hypercall area?

Yes, the guest re-does the wrmsr after migration.

cheers,
   Gerd
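
As a hedged sketch of the guest side of that, assuming the hypothetical
setup_hypercall_pages() from the sketch earlier in the thread and a pci
driver resume hook:

#include <linux/pci.h>

/* hypothetical: the host signals the migration, the driver core calls
 * resume, and the driver simply redoes the wrmsr registration sequence */
static int xen_platform_resume(struct pci_dev *pdev)
{
	return setup_hypercall_pages();
}
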
diff mbox

Patch

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@ 
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
+#include <asm/mce.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -734,23 +735,43 @@  static int set_msr_mtrr(struct kvm_vcpu 
 	return 0;
 }
 
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned bank_num = mcg_cap & 0xff;
+
 	switch (msr) {
-	case MSR_EFER:
-		set_efer(vcpu, data);
-		break;
-	case MSR_IA32_MC0_STATUS:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-		       __func__, data);
-		break;
 	case MSR_IA32_MCG_STATUS:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-			__func__, data);
+		vcpu->arch.mcg_status = data;
 		break;
 	case MSR_IA32_MCG_CTL:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-			__func__, data);
+		if (!(mcg_cap & MCG_CTL_P))
+			return 1;
+		if (data != 0 && data != ~(u64)0)
+			return -1;
+		vcpu->arch.mcg_ctl = data;
+		break;
+	default:
+		if (msr >= MSR_IA32_MC0_CTL &&
+		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+			u32 offset = msr - MSR_IA32_MC0_CTL;
+			/* only 0 or all 1s can be written to IA32_MCi_CTL */
+			if ((offset & 0x3) == 0 &&
+			    data != 0 && data != ~(u64)0)
+				return -1;
+			vcpu->arch.mce_banks[offset] = data;
+			break;
+		}
+		return 1;
+	}
+	return 0;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	switch (msr) {
+	case MSR_EFER:
+		set_efer(vcpu, data);
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!data) {
@@ -807,6 +828,8 @@  int kvm_set_msr_common(struct kvm_vcpu *
 		break;
 	}
 	default:
+		if (!set_msr_mce(vcpu, msr, data))
+			break;
 		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 		return 1;
 	}
@@ -861,26 +884,49 @@  static int get_msr_mtrr(struct kvm_vcpu 
 	return 0;
 }
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
 	u64 data;
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned bank_num = mcg_cap & 0xff;
 
 	switch (msr) {
-	case 0xc0010010: /* SYSCFG */
-	case 0xc0010015: /* HWCR */
-	case MSR_IA32_PLATFORM_ID:
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
-	case MSR_IA32_MC0_CTL:
-	case MSR_IA32_MCG_STATUS:
+		data = 0;
+		break;
 	case MSR_IA32_MCG_CAP:
+		data = vcpu->arch.mcg_cap;
+		break;
 	case MSR_IA32_MCG_CTL:
-	case MSR_IA32_MC0_MISC:
-	case MSR_IA32_MC0_MISC+4:
-	case MSR_IA32_MC0_MISC+8:
-	case MSR_IA32_MC0_MISC+12:
-	case MSR_IA32_MC0_MISC+16:
-	case MSR_IA32_MC0_MISC+20:
+		if (!(mcg_cap & MCG_CTL_P))
+			return 1;
+		data = vcpu->arch.mcg_ctl;
+		break;
+	case MSR_IA32_MCG_STATUS:
+		data = vcpu->arch.mcg_status;
+		break;
+	default:
+		if (msr >= MSR_IA32_MC0_CTL &&
+		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+			u32 offset = msr - MSR_IA32_MC0_CTL;
+			data = vcpu->arch.mce_banks[offset];
+			break;
+		}
+		return 1;
+	}
+	*pdata = data;
+	return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+	u64 data;
+
+	switch (msr) {
+	case 0xc0010010: /* SYSCFG */
+	case 0xc0010015: /* HWCR */
+	case MSR_IA32_PLATFORM_ID:
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_DEBUGCTLMSR:
@@ -921,6 +967,8 @@  int kvm_get_msr_common(struct kvm_vcpu *
 		data = vcpu->arch.time;
 		break;
 	default:
+		if (!get_msr_mce(vcpu, msr, &data))
+			break;
 		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 		return 1;
 	}
@@ -1443,6 +1491,87 @@  static int vcpu_ioctl_tpr_access_reporti
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+					u64 mcg_cap)
+{
+	int r;
+	unsigned bank_num = mcg_cap & 0xff, bank;
+	u64 *banks;
+
+	r = -EINVAL;
+	if (!bank_num)
+		goto out;
+	r = -ENOMEM;
+	banks = kzalloc(bank_num * sizeof(u64) * 4, GFP_KERNEL);
+	if (!banks)
+		goto out;
+	r = 0;
+	vcpu->arch.mce_banks = banks;
+	vcpu->arch.mcg_cap = mcg_cap;
+	/* Init IA32_MCG_CTL to all 1s */
+	if (mcg_cap & MCG_CTL_P)
+		vcpu->arch.mcg_ctl = ~(u64)0;
+	/* Init IA32_MCi_CTL to all 1s */
+	for (bank = 0; bank < bank_num; bank++)
+		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+	return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+				      struct kvm_x86_mce *mce)
+{
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned bank_num = mcg_cap & 0xff;
+	u64 *banks = vcpu->arch.mce_banks;
+
+	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+		return -EINVAL;
+	/*
+	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
+	 * reporting is disabled
+	 */
+	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+	    vcpu->arch.mcg_ctl != ~(u64)0)
+		return 0;
+	banks += 4 * mce->bank;
+	/*
+	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
+	 * reporting is disabled for the bank
+	 */
+	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+		return 0;
+	if (mce->status & MCI_STATUS_UC) {
+		u64 status = mce->status;
+		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+		    !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+			printk(KERN_DEBUG "kvm: set_mce: "
+			       "injects mce exception while "
+			       "previous one is in progress!\n");
+			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+			return 0;
+		}
+		if (banks[1] & MCI_STATUS_VAL)
+			status |= MCI_STATUS_OVER;
+		banks[1] = status;
+		banks[2] = mce->addr;
+		banks[3] = mce->misc;
+		vcpu->arch.mcg_status = mce->mcg_status;
+		kvm_queue_exception(vcpu, MC_VECTOR);
+	} else if (!(banks[1] & MCI_STATUS_VAL) ||
+		   (!(banks[1] & MCI_STATUS_UC) &&
+		    !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) {
+		u64 status = mce->status;
+		if (banks[1] & MCI_STATUS_VAL)
+			status |= MCI_STATUS_OVER;
+		banks[1] = status;
+		banks[2] = mce->addr;
+		banks[3] = mce->misc;
+	} else
+		banks[1] |= MCI_STATUS_OVER;
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -1576,6 +1705,31 @@  long kvm_arch_vcpu_ioctl(struct file *fi
 		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
 		break;
 	}
+	case KVM_X86_SETUP_MCE: {
+		u64 mcg_cap;
+
+		r = -EFAULT;
+		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+			goto out;
+		/*
+		 * extended machine-check state registers and CMCI are
+		 * not supported.
+		 */
+		mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
+		if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
+			goto out;
+		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+		break;
+	}
+	case KVM_X86_SET_MCE: {
+		struct kvm_x86_mce mce;
+
+		r = -EFAULT;
+		if (copy_from_user(&mce, argp, sizeof mce))
+			goto out;
+		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -371,6 +371,11 @@  struct kvm_vcpu_arch {
 	unsigned long dr6;
 	unsigned long dr7;
 	unsigned long eff_db[KVM_NR_DB_REGS];
+
+	u64 mcg_cap;
+	u64 mcg_status;
+	u64 mcg_ctl;
+	u64 *mce_banks;
 };
 
 struct kvm_mem_alias {
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -292,6 +292,18 @@  struct kvm_guest_debug {
 	struct kvm_guest_debug_arch arch;
 };
 
+/* x86 MCE */
+struct kvm_x86_mce {
+	__u64 status;
+	__u64 addr;
+	__u64 misc;
+	__u64 mcg_status;
+	__u8 bank;
+	__u8 pad1;
+	__u16 pad2;
+	__u32 pad3;
+};
+
 #define KVM_TRC_SHIFT           16
 /*
  * kvm trace categories
@@ -528,6 +540,9 @@  struct kvm_irq_routing {
 #define KVM_NMI                   _IO(KVMIO,  0x9a)
 /* Available with KVM_CAP_SET_GUEST_DEBUG */
 #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE         _IOWR(KVMIO,  0x9c, __u64)
+#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9d, struct kvm_x86_mce)
 
 /*
  * Deprecated interfaces
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -13,6 +13,7 @@ 
 #define MCG_CTL_P	 (1UL<<8)   /* MCG_CAP register available */
 #define MCG_EXT_P	 (1ULL<<9)   /* Extended registers available */
 #define MCG_CMCI_P	 (1ULL<<10)  /* CMCI supported */
+#define MCG_TES_P	 (1ULL<<11)  /* Threshold-based error status */
 
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
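
For completeness, a hypothetical userspace sketch of driving the two new
ioctls end-to-end against the patched linux/kvm.h above.  vcpu_fd is assumed
to be an already-created vcpu file descriptor, and the MCI/MCG bit values are
copied from asm/mce.h for illustration.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* bit values copied from asm/mce.h for illustration */
#define MCG_CTL_P	 (1ULL << 8)
#define MCG_STATUS_MCIP	 (1ULL << 2)
#define MCI_STATUS_VAL	 (1ULL << 63)
#define MCI_STATUS_UC	 (1ULL << 61)
#define MCI_STATUS_ADDRV (1ULL << 58)

int inject_mce(int vcpu_fd)
{
	__u64 mcg_cap = MCG_CTL_P | 6;	/* ask for MCG_CTL and 6 banks */
	struct kvm_x86_mce mce;

	/* the kernel strips MCG_EXT_P/MCG_CMCI_P and copies the result back */
	if (ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap) < 0)
		return -1;

	memset(&mce, 0, sizeof(mce));
	mce.bank = 0;
	mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_ADDRV;
	mce.addr = 0x1234000;			/* made-up guest address */
	mce.mcg_status = MCG_STATUS_MCIP;	/* machine check in progress */
	return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
}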