diff mbox

VMX Unrestricted mode support

Message ID 1243552292.25456.23.camel@mukti.sc.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nitin A Kamble May 28, 2009, 11:11 p.m. UTC
Avi,

A new VMX feature "Unrestricted Guest" feature is added in the VMX
specification. You can look at the latest Intel processor manual for
details of the feature here:

 http://www.intel.com/products/processor/manuals

    It allows kvm guests to run real mode and unpaged mode
code natively in the VMX mode when EPT is turned on. With the
unrestricted guest there is no need to emulate the guest real mode code
in the vm86 container or in the emulator. Guest big real mode
code also works natively. 

  The attached patch enhances KVM to use the unrestricted guest feature
if available on the processor. It also adds a new kernel/module
parameter to disable the unrestricted guest feature at the boot time. 

Signed-Off-By: Nitin A Kamble <nitin.a.kamble@intel.com>

Thanks & Regards,
Nitin


 
@@ -268,6 +272,12 @@ static inline int cpu_has_vmx_ept(void)
 		SECONDARY_EXEC_ENABLE_EPT;
 }
 
+static inline int cpu_has_vmx_unrestricted_guest(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return flexpriority_enabled &&
@@ -731,7 +741,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu
*vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-	if (vcpu->arch.rmode.active)
+	if (vcpu->arch.rmode.active && !enable_unrestricted_guest)
 		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
@@ -1195,7 +1205,8 @@ static __init int setup_vmcs_config(struct
vmcs_config *vmcs_conf)
 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 			SECONDARY_EXEC_WBINVD_EXITING |
 			SECONDARY_EXEC_ENABLE_VPID |
-			SECONDARY_EXEC_ENABLE_EPT;
+			SECONDARY_EXEC_ENABLE_EPT |
+			SECONDARY_EXEC_UNRESTRICTED_GUEST;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -1325,8 +1336,13 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_vpid())
 		enable_vpid = 0;
 
-	if (!cpu_has_vmx_ept())
+	if (!cpu_has_vmx_ept()) {
 		enable_ept = 0;
+		enable_unrestricted_guest = 0;
+	}
+
+	if (!cpu_has_vmx_unrestricted_guest())
+		enable_unrestricted_guest = 0;
 
 	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
@@ -1363,9 +1379,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	unsigned long flags;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 0;
 
+	update_exception_bitmap(vcpu);
+
+	if (enable_unrestricted_guest) {
+		vmx->emulation_required = 0;
+		return;
+	}
+
+	vmx->emulation_required = 1;
+
 	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
 	vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
 	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
@@ -1378,8 +1402,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
 			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
 
-	update_exception_bitmap(vcpu);
-
 	if (emulate_invalid_guest_state)
 		return;
 
@@ -1425,8 +1447,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	unsigned long flags;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 1;
+	update_exception_bitmap(vcpu);
+
+	if (enable_unrestricted_guest) {
+		vmx->emulation_required = 0;
+		goto continue_rmode;
+	}
+
+	vmx->emulation_required = 1;
 
 	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
 	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1445,7 +1474,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
 	vmcs_writel(GUEST_RFLAGS, flags);
 	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
-	update_exception_bitmap(vcpu);
 
 	if (emulate_invalid_guest_state)
 		goto continue_rmode;
@@ -1563,7 +1591,6 @@ static void ept_update_paging_mode_cr0(unsigned
long *hw_cr0,
 			      CPU_BASED_CR3_STORE_EXITING));
 		vcpu->arch.cr0 = cr0;
 		vmx_set_cr4(vcpu, vcpu->arch.cr4);
-		*hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
 		*hw_cr0 &= ~X86_CR0_WP;
 	} else if (!is_paging(vcpu)) {
 		/* From nonpaging to paging */
@@ -1733,6 +1760,23 @@ static u32 vmx_segment_access_rights(struct
kvm_segment *var)
 	return ar;
 }
 
+static inline u32 get_segment_ar(int seg)
+{
+	if (!enable_unrestricted_guest)
+		return 0xf3;
+
+	switch (seg) {
+	case VCPU_SREG_CS:
+		return 0x9b;
+	case VCPU_SREG_TR:
+		return 0x8b;
+	case VCPU_SREG_LDTR:
+		return 0x82;
+	default:
+		return 0x93;
+	}
+}
+
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg)
 {
@@ -1755,7 +1799,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 		 */
 		if (var->base == 0xffff0000 && var->selector == 0xf000)
 			vmcs_writel(sf->base, 0xf0000);
-		ar = 0xf3;
+		ar = get_segment_ar(seg);
 	} else
 		ar = vmx_segment_access_rights(var);
 	vmcs_write32(sf->ar_bytes, ar);
@@ -2058,7 +2102,7 @@ static void seg_setup(int seg)
 	vmcs_write16(sf->selector, 0);
 	vmcs_writel(sf->base, 0);
 	vmcs_write32(sf->limit, 0xffff);
-	vmcs_write32(sf->ar_bytes, 0xf3);
+	vmcs_write32(sf->ar_bytes, get_segment_ar(seg));
 }
 
 static int alloc_apic_access_page(struct kvm *kvm)
@@ -2201,6 +2245,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 		if (!enable_ept)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		if (!enable_unrestricted_guest)
+			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}

Comments

Alexey Eremenko May 28, 2009, 11:39 p.m. UTC | #1
VMX Unrestricted mode support -- looks like very interesting (and
useful !) feature.

Which CPUs support it ?

Core i7 900-series (Nehalem) ?
Nitin A Kamble May 29, 2009, 4:04 a.m. UTC | #2
The unrestricted guest feature is introduced in the Westmere processor.
Westmere is the successor to Nehalem, and all following processors will
support it. Westmere will be the first processor built on the 32nm
process.

Thanks,
Nitin

On Thu, 2009-05-28 at 16:39 -0700, Alexey Eremenko wrote:
> VMX Unrestricted mode support -- looks like very interesting (and
> useful !) feature.
> 
> Which CPUs support it ?
> 
> Core i7 900-series (Nehalem) ?
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity May 31, 2009, 8:39 a.m. UTC | #3
Nitin A Kamble wrote:
> Avi,
>
> A new VMX feature "Unrestricted Guest" feature is added in the VMX
> specification. You can look at the latest Intel processor manual for
> details of the feature here:
>
>  http://www.intel.com/products/processor/manuals
>
>     It allows kvm guests to run real mode and unpaged mode
> code natively in the VMX mode when EPT is turned on. With the
> unrestricted guest there is no need to emulate the guest real mode code
> in the vm86 container or in the emulator. Also the guest big real mode
> code works like native. 
>
>   The attached patch enhances KVM to use the unrestricted guest feature
> if available on the processor. It also adds a new kernel/module
> parameter to disable the unrestricted guest feature at the boot time. 
>   

Thanks, this is much needed.  Review comments below.

>  #define KVM_GUEST_CR0_MASK				   \
>  	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
>  	 | X86_CR0_NW | X86_CR0_CD)
> +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST				\
> +	(X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
> +#define KVM_VM_CR0_ALWAYS_ON_RESTRICTED_GUEST				\
> +	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
>  #define KVM_VM_CR0_ALWAYS_ON						\
> -	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
> -	 | X86_CR0_MP)
> +	(enable_unrestricted_guest ? KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
> +	: KVM_VM_CR0_ALWAYS_ON_RESTRICTED_GUEST)
>   

Please avoid hiding computations in macros.  Just change the call sites.

>  static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
>  {
>  	return flexpriority_enabled &&
> @@ -731,7 +741,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu
> *vcpu)
>  
>  static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
>  {
> -	if (vcpu->arch.rmode.active)
> +	if (vcpu->arch.rmode.active && !enable_unrestricted_guest)
>  		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
>  	vmcs_writel(GUEST_RFLAGS, rflags);
>  }
>   

Instead of changing all the checks like this, you can make rmode.active 
be false when unrestricted guest is enabled.  We can interpret 
rmode.active as "emulating real mode via vm86", not as "guest is in real 
mode".

You can just have enter_rmode() exit immediately when called.

>  
> +static inline u32 get_segment_ar(int seg)
> +{
> +	if (!enable_unrestricted_guest)
> +		return 0xf3;
> +
> +	switch (seg) {
> +	case VCPU_SREG_CS:
> +		return 0x9b;
> +	case VCPU_SREG_TR:
> +		return 0x8b;
> +	case VCPU_SREG_LDTR:
> +		return 0x82;
> +	default:
> +		return 0x93;
> +	}
> +}
> +
>  static void vmx_set_segment(struct kvm_vcpu *vcpu,
>  			    struct kvm_segment *var, int seg)
>  {
> @@ -1755,7 +1799,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
>  		 */
>  		if (var->base == 0xffff0000 && var->selector == 0xf000)
>  			vmcs_writel(sf->base, 0xf0000);
> -		ar = 0xf3;
> +		ar = get_segment_ar(seg);
>   

I think this can go away if rmode.active == 0.
Anthony Liguori May 31, 2009, 3:23 p.m. UTC | #4
Nitin A Kamble wrote:
> Avi,
>
> A new VMX feature "Unrestricted Guest" feature is added in the VMX
> specification. You can look at the latest Intel processor manual for
> details of the feature here:
>
>  http://www.intel.com/products/processor/manuals
>
>     It allows kvm guests to run real mode and unpaged mode
> code natively in the VMX mode when EPT is turned on. With the
> unrestricted guest there is no need to emulate the guest real mode code
> in the vm86 container or in the emulator. Also the guest big real mode
> code works like native. 
>
>   The attached patch enhances KVM to use the unrestricted guest feature
> if available on the processor. It also adds a new kernel/module
> parameter to disable the unrestricted guest feature at the boot time. 
>
> Signed-Off-By: Nitin A Kamble <nitin.a.kamble@intel.com>
>
> Thanks & Regards,
> Nitin
>   

It may make sense to expose this as a CAP down to userspace.

We could possibly make better error messages in userspace when we get an
unknown vmexit failure on pre-wsm hardware.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity May 31, 2009, 4:01 p.m. UTC | #5
Anthony Liguori wrote:
> Nitin A Kamble wrote:
>   
>> Avi,
>>
>> A new VMX feature "Unrestricted Guest" feature is added in the VMX
>> specification. You can look at the latest Intel processor manual for
>> details of the feature here:
>>
>>  http://www.intel.com/products/processor/manuals
>>
>>     It allows kvm guests to run real mode and unpaged mode
>> code natively in the VMX mode when EPT is turned on. With the
>> unrestricted guest there is no need to emulate the guest real mode code
>> in the vm86 container or in the emulator. Also the guest big real mode
>> code works like native. 
>>
>>   The attached patch enhances KVM to use the unrestricted guest feature
>> if available on the processor. It also adds a new kernel/module
>> parameter to disable the unrestricted guest feature at the boot time. 
>>
>> Signed-Off-By: Nitin A Kamble <nitin.a.kamble@intel.com>
>>
>> Thanks & Regards,
>> Nitin
>>   
>>     
>
> It may make sense to expose this as a CAP down to userspace.
>
> We could possibly make better error messages in userspace when we get an
> unknown vmexit failure on pre-wsm hardware.
>   

If we see the dreaded 0x80000021 and CR0.PE=0, then we're likely seeing 
a real mode emulation problem.
Nitin A Kamble June 1, 2009, 6:06 p.m. UTC | #6
On Sun, 2009-05-31 at 01:39 -0700, Avi Kivity wrote:

> Thanks, this is much needed.  Review comments below.
> 

> Please avoid hiding computations in macros.  Just change the call sites.

> Instead of changing all the checks like this, you can make rmode.active 
> be false when unrestricted guest is enabled.  We can interpret 
> rmode.active as "emulating real mode via vm86", not as "guest is in real 
> mode".
> 
> You can just have enter_rmode() exit immediately when called.
> 
> I think this can go away if rmode.active == 0.
> 

Avi,
  Thanks for your comments. I will cook another patch for this feature
as per your comments. 

Thanks & Regards,
Nitin


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nitin A Kamble June 1, 2009, 6:35 p.m. UTC | #7
On Mon, 2009-06-01 at 11:06 -0700, Nitin A Kamble wrote:
> On Sun, 2009-05-31 at 01:39 -0700, Avi Kivity wrote:

> > Instead of changing all the checks like this, you can make rmode.active 
> > be false when unrestricted guest is enabled.  We can interpret 
> > rmode.active as "emulating real mode via vm86", not as "guest is in real 
> > mode".

Avi,
 How about renaming rmode.active to rmode.vm86_active ?

Thanks & Regards,
Nitin

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 1, 2009, 6:38 p.m. UTC | #8
Nitin A Kamble wrote:
> Avi,
>  How about renaming rmode.active to rmode.vm86_active ?
>   

Sure.  But if you do that, then do it in a separate patch please.
Nitin A Kamble June 3, 2009, 6:08 p.m. UTC | #9
Hi Avi,
  I find that the qemu processor reset state does not conform to the IA32
processor specification. (Section 8.1.1 of
http://www.intel.com/Assets/PDF/manual/253668.pdf)

In qemu-kvm.git in file target-i386/helper.c in function cpu_reset the
segment registers are initialized as follows:

cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
				DESC_R_MASK);
cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);

The IA32 CPU reset state specification says that the Segment Accessed
bit is also 1 at the time of CPU reset, so the above code should look
like this:

cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
                 DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
                 DESC_R_MASK | DESC_A_MASK);
cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
                 DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | DESC_A_MASK);
cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
                 DESC_P_MASK | DESC_S_MASK | DESC_W_MASK| DESC_A_MASK);
cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
                 DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |DESC_A_MASK);
cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
                 DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
                 DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);

This discrepancy is adding the need of the following function in the
unrestricted guest patch.

+static inline u32 get_segment_ar(int seg)
+{
+       if (!enable_unrestricted_guest)
+               return 0xf3;
+
+       switch (seg) {
+       case VCPU_SREG_CS:
+               return 0x9b;
+       case VCPU_SREG_TR:
+               return 0x8b;
+       case VCPU_SREG_LDTR:
+               return 0x82;
+       default:
+               return 0x93;
+       }
+}
+

For unrestricted guest support, we can either fix this discrepancy in
the qemu code or add functionality like get_segment_ar() to the kvm
vmx code.
  What do you suggest?

Thanks & Regards,
Nitin
					                       

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nitin A Kamble June 3, 2009, 6:18 p.m. UTC | #10
Avi,
   I also tested that the qemu change does not affect the vm86 (no unrestricted
guest) code path.

Thanks & Regards,
Nitin


On Wed, 2009-06-03 at 11:08 -0700, Nitin A Kamble wrote:
> Hi Avi,
>   I find that the qemu processor reset state is not per the IA32
> processor specifications. (Sections 8.1.1 of
> http://www.intel.com/Assets/PDF/manual/253668.pdf)
> 
> In qemu-kvm.git in file target-i386/helper.c in function cpu_reset the
> segment registers are initialized as follows:
> 
> cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
> 				DESC_R_MASK);
> cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> 
> While the IA32 cpu reset state specification says that Segment Accessed
> bit is also 1 at the time of cpu reset. so the above code should look
> like this:
> 
> cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
>                  DESC_R_MASK | DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK| DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> 
> This discrepancy is adding the need of the following function in the
> unrestricted guest patch.
> 
> +static inline u32 get_segment_ar(int seg)
> +{
> +       if (!enable_unrestricted_guest)
> +               return 0xf3;
> +
> +       switch (seg) {
> +       case VCPU_SREG_CS:
> +               return 0x9b;
> +       case VCPU_SREG_TR:
> +               return 0x8b;
> +       case VCPU_SREG_LDTR:
> +               return 0x82;
> +       default:
> +               return 0x93;
> +       }
> +}
> +
> 
> For the unrestricted guest support either we can fix this discrepancy in
> the qemu code, or have a functionality like get_segment_ar() in the kvm
> vmx code. 
>   what do you suggest ?
> 
> Thanks & Regards,
> Nitin
> 					                       
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 3, 2009, 6:20 p.m. UTC | #11
Nitin A Kamble wrote:
> Hi Avi,
>   I find that the qemu processor reset state is not per the IA32
> processor specifications. (Sections 8.1.1 of
> http://www.intel.com/Assets/PDF/manual/253668.pdf)
>
> In qemu-kvm.git in file target-i386/helper.c in function cpu_reset the
> segment registers are initialized as follows:
>
> cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
> 				DESC_R_MASK);
> cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
>
> While the IA32 cpu reset state specification says that Segment Accessed
> bit is also 1 at the time of cpu reset. so the above code should look
> like this:
>
> cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
>                  DESC_R_MASK | DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK| DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
>
> This discrepancy is adding the need of the following function in the
> unrestricted guest patch.
>
> +static inline u32 get_segment_ar(int seg)
> +{
> +       if (!enable_unrestricted_guest)
> +               return 0xf3;
> +
> +       switch (seg) {
> +       case VCPU_SREG_CS:
> +               return 0x9b;
> +       case VCPU_SREG_TR:
> +               return 0x8b;
> +       case VCPU_SREG_LDTR:
> +               return 0x82;
> +       default:
> +               return 0x93;
> +       }
> +}
> +
>
> For the unrestricted guest support either we can fix this discrepancy in
> the qemu code, or have a functionality like get_segment_ar() in the kvm
> vmx code. 
>   what do you suggest ?
>
>   

Qemu should be fixed of course, but we want kvm to keep working with 
older qemu.  So please also have KVM_SET_SREGS set the A bit on segments 
which are not unusable.
Jan Kiszka June 4, 2009, 6:13 p.m. UTC | #12
Nitin A Kamble wrote:
> Hi Avi,
>   I find that the qemu processor reset state is not per the IA32
> processor specifications. (Sections 8.1.1 of
> http://www.intel.com/Assets/PDF/manual/253668.pdf)
> 
> In qemu-kvm.git in file target-i386/helper.c in function cpu_reset the
> segment registers are initialized as follows:
> 
> cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
> 				DESC_R_MASK);
> cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
>                            DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> 
> While the IA32 cpu reset state specification says that Segment Accessed
> bit is also 1 at the time of cpu reset. so the above code should look
> like this:
> 
> cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | 
>                  DESC_R_MASK | DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK | DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK| DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |DESC_A_MASK);
> cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
>                  DESC_P_MASK | DESC_S_MASK | DESC_W_MASK);
> 
> This discrepancy is adding the need of the following function in the
> unrestricted guest patch.

As Avi already indicated: Independent of the kvm workaround for older
qemu versions, please post (to qemu-devel) a patch against upstream's
git to fix the discrepancy.

Jan
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2b082d..7832599 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,9 +40,13 @@ 
 #define KVM_GUEST_CR0_MASK				   \
 	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
 	 | X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST				\
+	(X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
+#define KVM_VM_CR0_ALWAYS_ON_RESTRICTED_GUEST				\
+	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 #define KVM_VM_CR0_ALWAYS_ON						\
-	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
-	 | X86_CR0_MP)
+	(enable_unrestricted_guest ? KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+	: KVM_VM_CR0_ALWAYS_ON_RESTRICTED_GUEST)
 #define KVM_GUEST_CR4_MASK						\
 	(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944..c73da02 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -55,6 +55,7 @@ 
 #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
 
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 25f1239..703d2c4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -50,6 +50,10 @@  module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 static int __read_mostly enable_ept = 1;
 module_param_named(ept, enable_ept, bool, S_IRUGO);
 
+static int __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+			enable_unrestricted_guest, bool, S_IRUGO);
+
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
@@ -268,6 +272,12 @@  static inline int cpu_has_vmx_ept(void)
 		SECONDARY_EXEC_ENABLE_EPT;
 }
 
+static inline int cpu_has_vmx_unrestricted_guest(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return flexpriority_enabled &&
@@ -731,7 +741,7 @@  static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-	if (vcpu->arch.rmode.active)
+	if (vcpu->arch.rmode.active && !enable_unrestricted_guest)
 		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
@@ -1195,7 +1205,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 			SECONDARY_EXEC_WBINVD_EXITING |
 			SECONDARY_EXEC_ENABLE_VPID |
-			SECONDARY_EXEC_ENABLE_EPT;
+			SECONDARY_EXEC_ENABLE_EPT |
+			SECONDARY_EXEC_UNRESTRICTED_GUEST;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -1325,8 +1336,13 @@  static __init int hardware_setup(void)
 	if (!cpu_has_vmx_vpid())
 		enable_vpid = 0;
 
-	if (!cpu_has_vmx_ept())
+	if (!cpu_has_vmx_ept()) {
 		enable_ept = 0;
+		enable_unrestricted_guest = 0;
+	}
+
+	if (!cpu_has_vmx_unrestricted_guest())
+		enable_unrestricted_guest = 0;
 
 	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
@@ -1363,9 +1379,17 @@  static void enter_pmode(struct kvm_vcpu *vcpu)
 	unsigned long flags;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 0;
 
+	update_exception_bitmap(vcpu);
+
+	if (enable_unrestricted_guest) {
+		vmx->emulation_required = 0;
+		return;
+	}
+
+	vmx->emulation_required = 1;
+
 	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
 	vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
 	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
@@ -1378,8 +1402,6 @@  static void enter_pmode(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
 			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
 
-	update_exception_bitmap(vcpu);
-
 	if (emulate_invalid_guest_state)
 		return;
 
@@ -1425,8 +1447,15 @@  static void enter_rmode(struct kvm_vcpu *vcpu)
 	unsigned long flags;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 1;
+	update_exception_bitmap(vcpu);
+
+	if (enable_unrestricted_guest) {
+		vmx->emulation_required = 0;
+		goto continue_rmode;
+	}
+
+	vmx->emulation_required = 1;
 
 	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
 	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1445,7 +1474,6 @@  static void enter_rmode(struct kvm_vcpu *vcpu)
 
 	vmcs_writel(GUEST_RFLAGS, flags);
 	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
-	update_exception_bitmap(vcpu);
 
 	if (emulate_invalid_guest_state)
 		goto continue_rmode;
@@ -1563,7 +1591,6 @@  static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 			      CPU_BASED_CR3_STORE_EXITING));
 		vcpu->arch.cr0 = cr0;
 		vmx_set_cr4(vcpu, vcpu->arch.cr4);
-		*hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
 		*hw_cr0 &= ~X86_CR0_WP;
 	} else if (!is_paging(vcpu)) {
 		/* From nonpaging to paging */
@@ -1733,6 +1760,23 @@  static u32 vmx_segment_access_rights(struct kvm_segment *var)
 	return ar;
 }
 
+static inline u32 get_segment_ar(int seg)
+{
+	if (!enable_unrestricted_guest)
+		return 0xf3;
+
+	switch (seg) {
+	case VCPU_SREG_CS:
+		return 0x9b;
+	case VCPU_SREG_TR:
+		return 0x8b;
+	case VCPU_SREG_LDTR:
+		return 0x82;
+	default:
+		return 0x93;
+	}
+}
+
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg)
 {
@@ -1755,7 +1799,7 @@  static void vmx_set_segment(struct kvm_vcpu *vcpu,
 		 */
 		if (var->base == 0xffff0000 && var->selector == 0xf000)
 			vmcs_writel(sf->base, 0xf0000);
-		ar = 0xf3;
+		ar = get_segment_ar(seg);
 	} else
 		ar = vmx_segment_access_rights(var);
 	vmcs_write32(sf->ar_bytes, ar);
@@ -2058,7 +2102,7 @@  static void seg_setup(int seg)
 	vmcs_write16(sf->selector, 0);
 	vmcs_writel(sf->base, 0);
 	vmcs_write32(sf->limit, 0xffff);
-	vmcs_write32(sf->ar_bytes, 0xf3);
+	vmcs_write32(sf->ar_bytes, get_segment_ar(seg));
 }
 
 static int alloc_apic_access_page(struct kvm *kvm)
@@ -2201,6 +2245,8 @@  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 		if (!enable_ept)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		if (!enable_unrestricted_guest)
+			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}