diff mbox

[v2] KVM: nVMX: Fully support nested VMX preemption timer

Message ID 1377444390-4609-1-git-send-email-yzt356@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Arthur Chunqi Li Aug. 25, 2013, 3:26 p.m. UTC
This patch contains the following two changes:
1. Fix a bug in nested preemption timer support. If a vmexit L2->L0
occurs for a reason that is not emulated by L1, the preemption timer
value should be saved on such exits.
2. Add support for the "Save VMX-preemption timer value" VM-exit
control to nVMX.

With this patch, nested VMX preemption timer features are fully
supported.

Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
---
 arch/x86/kvm/vmx.c |   49 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 5 deletions(-)

Comments

Jan Kiszka Aug. 26, 2013, 7:23 a.m. UTC | #1
On 2013-08-25 17:26, Arthur Chunqi Li wrote:
> This patch contains the following two changes:
> 1. Fix a bug in nested preemption timer support. If a vmexit L2->L0
> occurs for a reason that is not emulated by L1, the preemption timer
> value should be saved on such exits.
> 2. Add support for the "Save VMX-preemption timer value" VM-exit
> control to nVMX.
> 
> With this patch, nested VMX preemption timer features are fully
> supported.
> 
> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> ---
>  arch/x86/kvm/vmx.c |   49 ++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 44 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 57b4e12..6aa320e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2204,7 +2204,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>  #ifdef CONFIG_X86_64
>  		VM_EXIT_HOST_ADDR_SPACE_SIZE |
>  #endif
> -		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> +		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> +		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER))
> +		nested_vmx_exit_ctls_high &=
> +			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> +	if (!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> +		nested_vmx_pinbased_ctls_high &=
> +			(~PIN_BASED_VMX_PREEMPTION_TIMER);
>  	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>  				      VM_EXIT_LOAD_IA32_EFER);
>  
> @@ -6706,6 +6713,22 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>  	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>  }
>  
> +static void nested_fix_preempt(struct kvm_vcpu *vcpu)

nested_adjust_preemption_timer - just "preempt" can be misleading.

> +{
> +	u64 delta_guest_tsc;
> +	u32 preempt_val, preempt_bit, delta_preempt_val;
> +
> +	preempt_bit = native_read_msr(MSR_IA32_VMX_MISC) & 0x1F;

This is rather preemption_timer_scale. And if there is no symbolic value
for the bitmask, please introduce one.

> +	delta_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
> +			native_read_tsc()) - vcpu->arch.last_guest_tsc;
> +	delta_preempt_val = delta_guest_tsc >> preempt_bit;
> +	preempt_val = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +	if (preempt_val - delta_preempt_val < 0)
> +		preempt_val = 0;
> +	else
> +		preempt_val -= delta_preempt_val;
> +	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val);

The rest is unfortunately wrong. It has to be split into two parts.
Part one, the calculation of L1's TSC value and its storing in
nested_vmx, has to be done on vmexit. Part two, which consists of reading
the current TSC, calculating the time spent in L0 and converting it into
L1 TSC time, has to be done right before vmentry of L2.

Arthur, please make sure that your test case properly detects the current
breakage of preemption timer emulation, both wrt. the missing
save/restore and the missing L0 time compensation, and then check that
your KVM patch fixes it based on the unit test results.

Jan

> +}
>  /*
>   * The guest has exited.  See if we can fix it or if we need userspace
>   * assistance.
> @@ -6734,9 +6757,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  	else
>  		vmx->nested.nested_run_pending = 0;
>  
> -	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> -		nested_vmx_vmexit(vcpu);
> -		return 1;
> +	if (is_guest_mode(vcpu)) {
> +		if (nested_vmx_exit_handled(vcpu)) {
> +			nested_vmx_vmexit(vcpu);
> +			return 1;
> +		} else
> +			nested_fix_preempt(vcpu);
>  	}
>  
>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
> @@ -7517,6 +7543,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 exec_control;
> +	u32 exit_control;
>  
>  	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>  	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
> @@ -7690,7 +7717,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>  	 * bits are further modified by vmx_set_efer() below.
>  	 */
> -	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> +	exit_control = vmcs_config.vmexit_ctrl;
> +	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
> +		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>  
>  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>  	 * emulated by vmx_set_efer(), below.
> @@ -8089,6 +8119,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	vmcs12->guest_pending_dbg_exceptions =
>  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>  
> +	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
> +		if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> +			vmcs12->vmx_preemption_timer_value =
> +				vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +		else
> +			vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> +					vmcs12->vmx_preemption_timer_value);
> +	}
> +
>  	/*
>  	 * In some cases (usually, nested EPT), L2 is allowed to change its
>  	 * own CR3 without exiting. If it has changed it, we must keep it.
>
Arthur Chunqi Li Aug. 26, 2013, 9:05 a.m. UTC | #2
On Mon, Aug 26, 2013 at 3:23 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2013-08-25 17:26, Arthur Chunqi Li wrote:
>> This patch contains the following two changes:
>> 1. Fix a bug in nested preemption timer support. If a vmexit L2->L0
>> occurs for a reason that is not emulated by L1, the preemption timer
>> value should be saved on such exits.
>> 2. Add support for the "Save VMX-preemption timer value" VM-exit
>> control to nVMX.
>>
>> With this patch, nested VMX preemption timer features are fully
>> supported.
>>
>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> ---
>>  arch/x86/kvm/vmx.c |   49 ++++++++++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 44 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 57b4e12..6aa320e 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2204,7 +2204,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>  #ifdef CONFIG_X86_64
>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>  #endif
>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER))
>> +             nested_vmx_exit_ctls_high &=
>> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>> +     if (!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> +             nested_vmx_pinbased_ctls_high &=
>> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>                                     VM_EXIT_LOAD_IA32_EFER);
>>
>> @@ -6706,6 +6713,22 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>>  }
>>
>> +static void nested_fix_preempt(struct kvm_vcpu *vcpu)
>
> nested_adjust_preemption_timer - just "preempt" can be misleading.
>
>> +{
>> +     u64 delta_guest_tsc;
>> +     u32 preempt_val, preempt_bit, delta_preempt_val;
>> +
>> +     preempt_bit = native_read_msr(MSR_IA32_VMX_MISC) & 0x1F;
>
> This is rather preemption_timer_scale. And if there is no symbolic value
> for the bitmask, please introduce one.
>
>> +     delta_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
>> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
>> +     delta_preempt_val = delta_guest_tsc >> preempt_bit;
>> +     preempt_val = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +     if (preempt_val - delta_preempt_val < 0)
>> +             preempt_val = 0;
>> +     else
>> +             preempt_val -= delta_preempt_val;
>> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val);
>
> The rest is unfortunately wrong. It has to be split into two parts.
> Part one, the calculation of L1's TSC value and its storing in
> nested_vmx, has to be done on vmexit. Part two, which consists of reading
> the current TSC, calculating the time spent in L0 and converting it into
> L1 TSC time, has to be done right before vmentry of L2.
As we discussed yesterday, however, the calculated L1 TSC value is not
saved in nested_vmx, in order to avoid adding code to the hot path of
vmexit. Instead, we use vcpu->arch.last_guest_tsc as the value stored
on vmexit (which is already done). The value of "part two" is
calculated in nested_fix_preempt() above (see the variable
delta_guest_tsc, which holds the TSC time consumed in L0). Since
vmx_handle_exit is the last function called in the vmexit path, I think
it's OK to put "part two" here.
>
> Arthur, please make sure that your test case properly detects the current
> breakage of preemption timer emulation, both wrt. the missing
> save/restore and the missing L0 time compensation, and then check that
> your KVM patch fixes it based on the unit test results.
OK, I will submit a kvm-unit-tests patch to test these changes.

Arthur
>
> Jan
>
>> +}
>>  /*
>>   * The guest has exited.  See if we can fix it or if we need userspace
>>   * assistance.
>> @@ -6734,9 +6757,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>       else
>>               vmx->nested.nested_run_pending = 0;
>>
>> -     if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
>> -             nested_vmx_vmexit(vcpu);
>> -             return 1;
>> +     if (is_guest_mode(vcpu)) {
>> +             if (nested_vmx_exit_handled(vcpu)) {
>> +                     nested_vmx_vmexit(vcpu);
>> +                     return 1;
>> +             } else
>> +                     nested_fix_preempt(vcpu);
>>       }
>>
>>       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>> @@ -7517,6 +7543,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  {
>>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>>       u32 exec_control;
>> +     u32 exit_control;
>>
>>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
>> @@ -7690,7 +7717,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>        * bits are further modified by vmx_set_efer() below.
>>        */
>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> +     exit_control = vmcs_config.vmexit_ctrl;
>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>> +             exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>>
>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>>        * emulated by vmx_set_efer(), below.
>> @@ -8089,6 +8119,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>       vmcs12->guest_pending_dbg_exceptions =
>>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>>
>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
>> +             if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> +                     vmcs12->vmx_preemption_timer_value =
>> +                             vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +             else
>> +                     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> +                                     vmcs12->vmx_preemption_timer_value);
>> +     }
>> +
>>       /*
>>        * In some cases (usually, nested EPT), L2 is allowed to change its
>>        * own CR3 without exiting. If it has changed it, we must keep it.
>>
>
>
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 57b4e12..6aa320e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2204,7 +2204,14 @@  static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+	if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER))
+		nested_vmx_exit_ctls_high &=
+			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+	if (!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+		nested_vmx_pinbased_ctls_high &=
+			(~PIN_BASED_VMX_PREEMPTION_TIMER);
 	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 				      VM_EXIT_LOAD_IA32_EFER);
 
@@ -6706,6 +6713,22 @@  static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
+static void nested_fix_preempt(struct kvm_vcpu *vcpu)
+{
+	u64 delta_guest_tsc;
+	u32 preempt_val, preempt_bit, delta_preempt_val;
+
+	preempt_bit = native_read_msr(MSR_IA32_VMX_MISC) & 0x1F;
+	delta_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+			native_read_tsc()) - vcpu->arch.last_guest_tsc;
+	delta_preempt_val = delta_guest_tsc >> preempt_bit;
+	preempt_val = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+	if (preempt_val - delta_preempt_val < 0)
+		preempt_val = 0;
+	else
+		preempt_val -= delta_preempt_val;
+	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val);
+}
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -6734,9 +6757,12 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	else
 		vmx->nested.nested_run_pending = 0;
 
-	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-		nested_vmx_vmexit(vcpu);
-		return 1;
+	if (is_guest_mode(vcpu)) {
+		if (nested_vmx_exit_handled(vcpu)) {
+			nested_vmx_vmexit(vcpu);
+			return 1;
+		} else
+			nested_fix_preempt(vcpu);
 	}
 
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
@@ -7517,6 +7543,7 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control;
+	u32 exit_control;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7690,7 +7717,10 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
 	 * bits are further modified by vmx_set_efer() below.
 	 */
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+	exit_control = vmcs_config.vmexit_ctrl;
+	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
 
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
@@ -8089,6 +8119,15 @@  static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
+		if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+			vmcs12->vmx_preemption_timer_value =
+				vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+		else
+			vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
+					vmcs12->vmx_preemption_timer_value);
+	}
+
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
 	 * own CR3 without exiting. If it has changed it, we must keep it.