
KVM: nVMX: Fully support nested VMX preemption timer

Message ID 1377369850-18583-1-git-send-email-root@Blade1-01.Blade1-01 (mailing list archive)
State New, archived

Commit Message

Arthur Chunqi Li Aug. 24, 2013, 6:44 p.m. UTC
This patch contains the following two changes:
1. Fix a bug in nested preemption timer support. If a vmexit from L2 to
L0 occurs for a reason not emulated by L1, the preemption timer value
should be saved across such exits.
2. Add support for the "Save VMX-preemption timer value" VM-Exit control
to nVMX.

With this patch, the nested VMX preemption timer feature is fully
supported.

Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
---
 arch/x86/kvm/vmx.c |   30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

Comments

Jan Kiszka Aug. 25, 2013, 6:44 a.m. UTC | #1
On 2013-08-24 20:44, root wrote:
> This patch contains the following two changes:
> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> with some reasons not emulated by L1, preemption timer value should
> be save in such exits.
> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> to nVMX.
> 
> With this patch, nested VMX preemption timer features are fully
> supported.
> 
> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> ---
>  arch/x86/kvm/vmx.c |   30 +++++++++++++++++++++++++-----
>  1 file changed, 25 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 57b4e12..9579409 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2204,7 +2204,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>  #ifdef CONFIG_X86_64
>  		VM_EXIT_HOST_ADDR_SPACE_SIZE |
>  #endif
> -		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> +		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> +		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>  	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>  				      VM_EXIT_LOAD_IA32_EFER);

In the absence of VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, you need to hide
PIN_BASED_VMX_PREEMPTION_TIMER from the guest as we cannot emulate its
behavior properly in that case.
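
A minimal sketch of that masking, assuming nested_vmx_exit_ctls_high has
already been filled from MSR_IA32_VMX_EXIT_CTLS; the placement inside
nested_vmx_setup_ctls_msrs() is an assumption, not the final code:

	/*
	 * Sketch: if the hardware cannot save the preemption timer value
	 * on VM exit, do not advertise the timer to L1 at all.
	 */
	if (!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
		nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;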

>  
> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  		(vmcs_config.pin_based_exec_ctrl |
>  		 vmcs12->pin_based_vm_exec_control));
>  
> -	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
> -		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> -			     vmcs12->vmx_preemption_timer_value);
> +	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
> +		if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> +			vmcs12->vmx_preemption_timer_value =
> +				vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +		else
> +			vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> +					vmcs12->vmx_preemption_timer_value);
> +	}

This is not correct. We still need to set the vmcs to
vmx_preemption_timer_value. The difference is that, on exit from L2,
vmx_preemption_timer_value has to be updated according to the saved
hardware state. The corresponding code is missing in your patch so far.
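
One possible shape of that logic, as a sketch; whether prepare_vmcs12()
is the right hook for the exit-side read-back is an assumption here:

	/* On entry to L2 (prepare_vmcs02): always program the value L1 set. */
	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
			     vmcs12->vmx_preemption_timer_value);

	/* On exit from L2 (e.g. in prepare_vmcs12): copy back what the
	 * hardware saved, but only if L1 asked for saving. */
	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
		vmcs12->vmx_preemption_timer_value =
			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);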

>  
>  	/*
>  	 * Whether page-faults are trapped is determined by a combination of
> @@ -7690,7 +7696,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>  	 * bits are further modified by vmx_set_efer() below.
>  	 */
> -	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> +	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
> +		vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl |
> +				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> +	else
> +		vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);

Let's prepare the value for VM_EXIT_CONTROLS in a local variable first,
then write it to the vmcs.
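
For example, as a sketch (the local name exit_controls is just
illustrative):

	u32 exit_controls = vmcs_config.vmexit_ctrl;

	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
		exit_controls |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
	vmcs_write32(VM_EXIT_CONTROLS, exit_controls);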

>  
>  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>  	 * emulated by vmx_set_efer(), below.
> @@ -7912,6 +7922,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  	}
>  
>  	/*
> +	 * If L2 support PIN_BASED_VMX_PREEMPTION_TIMER, L0 must support
> +	 * VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
> +	 */
> +	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
> +			!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
> +		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> +		return 1;
> +	}

Nope, the guest is free to run the preemption timer without saving on
exits. It may have a valid use case for this, e.g. that it will always
reprogram it on entry.

> +
> +	/*
>  	 * We're finally done with prerequisite checking, and can start with
>  	 * the nested entry.
>  	 */
> 

Jan
Arthur Chunqi Li Aug. 25, 2013, 7:24 a.m. UTC | #2
On Sun, Aug 25, 2013 at 2:44 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2013-08-24 20:44, root wrote:
>> This patch contains the following two changes:
>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> with some reasons not emulated by L1, preemption timer value should
>> be save in such exits.
>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>> to nVMX.
>>
>> With this patch, nested VMX preemption timer features are fully
>> supported.
>>
>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> ---
>>  arch/x86/kvm/vmx.c |   30 +++++++++++++++++++++++++-----
>>  1 file changed, 25 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 57b4e12..9579409 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2204,7 +2204,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>  #ifdef CONFIG_X86_64
>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>  #endif
>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>                                     VM_EXIT_LOAD_IA32_EFER);
>
> In the absence of VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, you need to hide
> PIN_BASED_VMX_PREEMPTION_TIMER from the guest as we cannot emulate its
> behavior properly in that case.
>
>>
>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>               (vmcs_config.pin_based_exec_ctrl |
>>                vmcs12->pin_based_vm_exec_control));
>>
>> -     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>> -             vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> -                          vmcs12->vmx_preemption_timer_value);
>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
>> +             if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> +                     vmcs12->vmx_preemption_timer_value =
>> +                             vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +             else
>> +                     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> +                                     vmcs12->vmx_preemption_timer_value);
>> +     }
>
> This is not correct. We still need to set the vmcs to
> vmx_preemption_timer_value. The difference is that, on exit from L2,
> vmx_preemption_timer_value has to be updated according to the saved
> hardware state. The corresponding code is missing in your patch so far.
>
>>
>>       /*
>>        * Whether page-faults are trapped is determined by a combination of
>> @@ -7690,7 +7696,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>        * bits are further modified by vmx_set_efer() below.
>>        */
>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>> +             vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl |
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>> +     else
>> +             vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>
> Let's prepare the value for VM_EXIT_CONTROLS in a local variable first,
> then write it to the vmcs.
>
>>
>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>>        * emulated by vmx_set_efer(), below.
>> @@ -7912,6 +7922,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>>       }
>>
>>       /*
>> +      * If L2 support PIN_BASED_VMX_PREEMPTION_TIMER, L0 must support
>> +      * VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>> +      */
>> +     if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
>> +                     !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>> +             nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
>> +             return 1;
>> +     }
>
> Nope, the guest is free to run the preemption timer without saving on
> exits. It may have a valid use case for this, e.g. that it will always
> reprogram it on entry.
Here "!(nested_vmx_exit_ctls_high &
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)" is used to detect if hardware
support "save preemption timer" feature, which means if L2 supports
pinbased vmx preemption timer, host must support "save preemption
timer" feature. Though nested_vmx_exit_ctls_* is used for nested env,
but it can also used to reflect the host's feature. Here is what I
discuss with you yesterday, and we can also get the feature via
"rdmsr" here to avoid the confusion.

Arthur
>
>> +
>> +     /*
>>        * We're finally done with prerequisite checking, and can start with
>>        * the nested entry.
>>        */
>>
>
> Jan
>
Jan Kiszka Aug. 25, 2013, 7:28 a.m. UTC | #3
On 2013-08-25 09:24, Arthur Chunqi Li wrote:
> On Sun, Aug 25, 2013 at 2:44 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>> On 2013-08-24 20:44, root wrote:
>>> This patch contains the following two changes:
>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>> with some reasons not emulated by L1, preemption timer value should
>>> be save in such exits.
>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>>> to nVMX.
>>>
>>> With this patch, nested VMX preemption timer features are fully
>>> supported.
>>>
>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>> ---
>>>  arch/x86/kvm/vmx.c |   30 +++++++++++++++++++++++++-----
>>>  1 file changed, 25 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index 57b4e12..9579409 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -2204,7 +2204,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>>  #ifdef CONFIG_X86_64
>>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>>  #endif
>>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>>                                     VM_EXIT_LOAD_IA32_EFER);
>>
>> In the absence of VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, you need to hide
>> PIN_BASED_VMX_PREEMPTION_TIMER from the guest as we cannot emulate its
>> behavior properly in that case.
>>
>>>
>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>>               (vmcs_config.pin_based_exec_ctrl |
>>>                vmcs12->pin_based_vm_exec_control));
>>>
>>> -     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>>> -             vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>> -                          vmcs12->vmx_preemption_timer_value);
>>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
>>> +             if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>> +                     vmcs12->vmx_preemption_timer_value =
>>> +                             vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>> +             else
>>> +                     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>> +                                     vmcs12->vmx_preemption_timer_value);
>>> +     }
>>
>> This is not correct. We still need to set the vmcs to
>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>> vmx_preemption_timer_value has to be updated according to the saved
>> hardware state. The corresponding code is missing in your patch so far.
>>
>>>
>>>       /*
>>>        * Whether page-faults are trapped is determined by a combination of
>>> @@ -7690,7 +7696,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>>        * bits are further modified by vmx_set_efer() below.
>>>        */
>>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>>> +             vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl |
>>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>>> +     else
>>> +             vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>>
>> Let's prepare the value for VM_EXIT_CONTROLS in a local variable first,
>> then write it to the vmcs.
>>
>>>
>>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>>>        * emulated by vmx_set_efer(), below.
>>> @@ -7912,6 +7922,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>>>       }
>>>
>>>       /*
>>> +      * If L2 support PIN_BASED_VMX_PREEMPTION_TIMER, L0 must support
>>> +      * VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>>> +      */
>>> +     if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
>>> +                     !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>>> +             nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
>>> +             return 1;
>>> +     }
>>
>> Nope, the guest is free to run the preemption timer without saving on
>> exits. It may have a valid use case for this, e.g. that it will always
>> reprogram it on entry.
> Here "!(nested_vmx_exit_ctls_high &
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)" is used to detect if hardware
> support "save preemption timer" feature, which means if L2 supports
> pinbased vmx preemption timer, host must support "save preemption
> timer" feature.

Sorry, parsed the code incorrectly.

> Though nested_vmx_exit_ctls_* is used for nested env,
> but it can also used to reflect the host's feature. Here is what I
> discuss with you yesterday, and we can also get the feature via
> "rdmsr" here to avoid the confusion.

Yes. The point is that we will not even expose
PIN_BASED_VMX_PREEMPTION_TIMER if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is
missing. If the guest then requests the former, it simply sets an
invalid pin-based control value which we already catch and report. So
this hunk becomes redundant.

Jan
Abel Gordon Aug. 25, 2013, 7:37 a.m. UTC | #4
> From: Jan Kiszka <jan.kiszka@web.de>
> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
> Date: 25/08/2013 09:44 AM
> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
timer
> Sent by: kvm-owner@vger.kernel.org
>
> On 2013-08-24 20:44, root wrote:
> > This patch contains the following two changes:
> > 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> > with some reasons not emulated by L1, preemption timer value should
> > be save in such exits.
> > 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> > to nVMX.
> >
> > With this patch, nested VMX preemption timer features are fully
> > supported.
> >
> > Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> > ---

> >
> > @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
> *vcpu, struct vmcs12 *vmcs12)
> >        (vmcs_config.pin_based_exec_ctrl |
> >         vmcs12->pin_based_vm_exec_control));
> >
> > -   if (vmcs12->pin_based_vm_exec_control &
PIN_BASED_VMX_PREEMPTION_TIMER)
> > -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> > -              vmcs12->vmx_preemption_timer_value);
> > +   if (vmcs12->pin_based_vm_exec_control &
> PIN_BASED_VMX_PREEMPTION_TIMER) {
> > +      if (vmcs12->vm_exit_controls &
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> > +         vmcs12->vmx_preemption_timer_value =
> > +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> > +      else
> > +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> > +               vmcs12->vmx_preemption_timer_value);
> > +   }
>
> This is not correct. We still need to set the vmcs to
> vmx_preemption_timer_value. The difference is that, on exit from L2,
> vmx_preemption_timer_value has to be updated according to the saved
> hardware state. The corresponding code is missing in your patch so far.

I think something else may be missing here: assuming L0 handles exits
for L2 without involving L1 (e.g. external interrupts or EPT violations),
we may spend some cycles in L0 handling these exits. Note that L1 is not
aware of these exits, and from L1's perspective L2 was running on the CPU.
That means we may need to subtract the cycles spent at L0 from the
preemption timer, or emulate a preemption timer exit to force a
transition to L1 instead of resuming L2.



Arthur Chunqi Li Aug. 25, 2013, 7:37 a.m. UTC | #5
On Sun, Aug 25, 2013 at 3:28 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2013-08-25 09:24, Arthur Chunqi Li wrote:
>> On Sun, Aug 25, 2013 at 2:44 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>>> On 2013-08-24 20:44, root wrote:
>>>> This patch contains the following two changes:
>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>> with some reasons not emulated by L1, preemption timer value should
>>>> be save in such exits.
>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>>>> to nVMX.
>>>>
>>>> With this patch, nested VMX preemption timer features are fully
>>>> supported.
>>>>
>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>> ---
>>>>  arch/x86/kvm/vmx.c |   30 +++++++++++++++++++++++++-----
>>>>  1 file changed, 25 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>> index 57b4e12..9579409 100644
>>>> --- a/arch/x86/kvm/vmx.c
>>>> +++ b/arch/x86/kvm/vmx.c
>>>> @@ -2204,7 +2204,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>>>  #ifdef CONFIG_X86_64
>>>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>>>  #endif
>>>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>>>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>>>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>>>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>>>                                     VM_EXIT_LOAD_IA32_EFER);
>>>
>>> In the absence of VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, you need to hide
>>> PIN_BASED_VMX_PREEMPTION_TIMER from the guest as we cannot emulate its
>>> behavior properly in that case.
Besides, we also need to check the converse: in the absence of
PIN_BASED_VMX_PREEMPTION_TIMER we need to hide
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though this combination should not
happen according to the Intel SDM.
>>>
>>>>
>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>>>               (vmcs_config.pin_based_exec_ctrl |
>>>>                vmcs12->pin_based_vm_exec_control));
>>>>
>>>> -     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>>>> -             vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>> -                          vmcs12->vmx_preemption_timer_value);
>>>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>> +             if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>> +                     vmcs12->vmx_preemption_timer_value =
>>>> +                             vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>> +             else
>>>> +                     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>> +                                     vmcs12->vmx_preemption_timer_value);
>>>> +     }
>>>
>>> This is not correct. We still need to set the vmcs to
>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>>> vmx_preemption_timer_value has to be updated according to the saved
>>> hardware state. The corresponding code is missing in your patch so far.
>>>
>>>>
>>>>       /*
>>>>        * Whether page-faults are trapped is determined by a combination of
>>>> @@ -7690,7 +7696,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>>>        * bits are further modified by vmx_set_efer() below.
>>>>        */
>>>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>>>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>>>> +             vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl |
>>>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>>>> +     else
>>>> +             vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>>>
>>> Let's prepare the value for VM_EXIT_CONTROLS in a local variable first,
>>> then write it to the vmcs.
>>>
>>>>
>>>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>>>>        * emulated by vmx_set_efer(), below.
>>>> @@ -7912,6 +7922,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>>>>       }
>>>>
>>>>       /*
>>>> +      * If L2 support PIN_BASED_VMX_PREEMPTION_TIMER, L0 must support
>>>> +      * VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>>>> +      */
>>>> +     if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
>>>> +                     !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>>>> +             nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
>>>> +             return 1;
>>>> +     }
>>>
>>> Nope, the guest is free to run the preemption timer without saving on
>>> exits. It may have a valid use case for this, e.g. that it will always
>>> reprogram it on entry.
>> Here "!(nested_vmx_exit_ctls_high &
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)" is used to detect if hardware
>> support "save preemption timer" feature, which means if L2 supports
>> pinbased vmx preemption timer, host must support "save preemption
>> timer" feature.
>
> Sorry, parsed the code incorrectly.
>
>> Though nested_vmx_exit_ctls_* is used for nested env,
>> but it can also used to reflect the host's feature. Here is what I
>> discuss with you yesterday, and we can also get the feature via
>> "rdmsr" here to avoid the confusion.
>
> Yes. The point is that we will not even expose
> PIN_BASED_VMX_PREEMPTION_TIMER if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is
> missing. If the guest then requests the former, it simply sets an
> invalid pin-based control value which we already catch and report. So
> this hunk becomes redundant.
Yep, if we check it when setting nested_vmx_*_ctls_high, this hunk
becomes redundant. Besides, see my comments above.

Arthur
>
> Jan
>
>
Arthur Chunqi Li Aug. 25, 2013, 7:42 a.m. UTC | #6
On Sun, Aug 25, 2013 at 3:37 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
>
>
>> From: Jan Kiszka <jan.kiszka@web.de>
>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>> Date: 25/08/2013 09:44 AM
>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> timer
>> Sent by: kvm-owner@vger.kernel.org
>>
>> On 2013-08-24 20:44, root wrote:
>> > This patch contains the following two changes:
>> > 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> > with some reasons not emulated by L1, preemption timer value should
>> > be save in such exits.
>> > 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>> > to nVMX.
>> >
>> > With this patch, nested VMX preemption timer features are fully
>> > supported.
>> >
>> > Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> > ---
>
>> >
>> > @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>> *vcpu, struct vmcs12 *vmcs12)
>> >        (vmcs_config.pin_based_exec_ctrl |
>> >         vmcs12->pin_based_vm_exec_control));
>> >
>> > -   if (vmcs12->pin_based_vm_exec_control &
> PIN_BASED_VMX_PREEMPTION_TIMER)
>> > -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> > -              vmcs12->vmx_preemption_timer_value);
>> > +   if (vmcs12->pin_based_vm_exec_control &
>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>> > +      if (vmcs12->vm_exit_controls &
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> > +         vmcs12->vmx_preemption_timer_value =
>> > +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> > +      else
>> > +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> > +               vmcs12->vmx_preemption_timer_value);
>> > +   }
>>
>> This is not correct. We still need to set the vmcs to
>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>> vmx_preemption_timer_value has to be updated according to the saved
>> hardware state. The corresponding code is missing in your patch so far.
>
> I think something else maybe be missing here: assuming L0 handles exits
> for L2 without involving L1 (e.g. external interrupts or ept violations),
> then, we may spend some cycles in L0 handling these exits. Note L1 is not
> aware of these exits and from L1 perspective L2 was running on the CPU.
> That means that we may need to reduce these cycles spent at
> L0 from the preemtion timer or emulate a preemption timer exit to
> force a transition to L1 instead of resuming L2.
My solution is to enable the "save VMX-preemption timer value" exit
control for L2 whenever L2 enables the "activate VMX-preemption timer"
feature. That way, external interrupts (and other such exits) save the
exact value into L2's vmcs, and that value is reloaded when L2 resumes.
Thus the cycles spent handling these vmexits in L0 do not affect L2's
preemption timer value.

Arthur
>
>
>
Jan Kiszka Aug. 25, 2013, 7:43 a.m. UTC | #7
On 2013-08-25 09:37, Abel Gordon wrote:
> 
> 
>> From: Jan Kiszka <jan.kiszka@web.de>
>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>> Date: 25/08/2013 09:44 AM
>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> timer
>> Sent by: kvm-owner@vger.kernel.org
>>
>> On 2013-08-24 20:44, root wrote:
>>> This patch contains the following two changes:
>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>> with some reasons not emulated by L1, preemption timer value should
>>> be save in such exits.
>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>>> to nVMX.
>>>
>>> With this patch, nested VMX preemption timer features are fully
>>> supported.
>>>
>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>> ---
> 
>>>
>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>> *vcpu, struct vmcs12 *vmcs12)
>>>        (vmcs_config.pin_based_exec_ctrl |
>>>         vmcs12->pin_based_vm_exec_control));
>>>
>>> -   if (vmcs12->pin_based_vm_exec_control &
> PIN_BASED_VMX_PREEMPTION_TIMER)
>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>> -              vmcs12->vmx_preemption_timer_value);
>>> +   if (vmcs12->pin_based_vm_exec_control &
>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>> +      if (vmcs12->vm_exit_controls &
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>> +         vmcs12->vmx_preemption_timer_value =
>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>> +      else
>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>> +               vmcs12->vmx_preemption_timer_value);
>>> +   }
>>
>> This is not correct. We still need to set the vmcs to
>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>> vmx_preemption_timer_value has to be updated according to the saved
>> hardware state. The corresponding code is missing in your patch so far.
> 
> I think something else maybe be missing here: assuming L0 handles exits
> for L2 without involving L1 (e.g. external interrupts or ept violations),
> then, we may spend some cycles in L0 handling these exits. Note L1 is not
> aware of these exits and from L1 perspective L2 was running on the CPU.
> That means that we may need to reduce these cycles spent at
> L0 from the preemtion timer or emulate a preemption timer exit to
> force a transition to L1 instead of resuming L2.

That's precisely what the logic I described should achieve: reload the
value we saved on L2 exit on reentry.

Jan
Jan Kiszka Aug. 25, 2013, 7:44 a.m. UTC | #8
On 2013-08-25 09:37, Arthur Chunqi Li wrote:
> On Sun, Aug 25, 2013 at 3:28 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>> On 2013-08-25 09:24, Arthur Chunqi Li wrote:
>>> On Sun, Aug 25, 2013 at 2:44 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>>>> On 2013-08-24 20:44, root wrote:
>>>>> This patch contains the following two changes:
>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>> be save in such exits.
>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>>>>> to nVMX.
>>>>>
>>>>> With this patch, nested VMX preemption timer features are fully
>>>>> supported.
>>>>>
>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>> ---
>>>>>  arch/x86/kvm/vmx.c |   30 +++++++++++++++++++++++++-----
>>>>>  1 file changed, 25 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>>> index 57b4e12..9579409 100644
>>>>> --- a/arch/x86/kvm/vmx.c
>>>>> +++ b/arch/x86/kvm/vmx.c
>>>>> @@ -2204,7 +2204,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>>>>  #ifdef CONFIG_X86_64
>>>>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>>>>  #endif
>>>>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>>>>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>>>>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>>>>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>>>>                                     VM_EXIT_LOAD_IA32_EFER);
>>>>
>>>> In the absence of VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, you need to hide
>>>> PIN_BASED_VMX_PREEMPTION_TIMER from the guest as we cannot emulate its
>>>> behavior properly in that case.
> Besides, we need to test that in the absence of
> PIN_BASED_VMX_PREEMPTION_TIMER, we need to hide
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though this should not happen
> according to Intel SDM.

If the SDM guarantees this for us, we don't need such a safety measure.
Otherwise, it should be added, yes.

Jan
Arthur Chunqi Li Aug. 25, 2013, 7:49 a.m. UTC | #9
On Sun, Aug 25, 2013 at 3:44 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2013-08-25 09:37, Arthur Chunqi Li wrote:
>> On Sun, Aug 25, 2013 at 3:28 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>>> On 2013-08-25 09:24, Arthur Chunqi Li wrote:
>>>> On Sun, Aug 25, 2013 at 2:44 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>>>>> On 2013-08-24 20:44, root wrote:
>>>>>> This patch contains the following two changes:
>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>>> be save in such exits.
>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>>>>>> to nVMX.
>>>>>>
>>>>>> With this patch, nested VMX preemption timer features are fully
>>>>>> supported.
>>>>>>
>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>>> ---
>>>>>>  arch/x86/kvm/vmx.c |   30 +++++++++++++++++++++++++-----
>>>>>>  1 file changed, 25 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>>>> index 57b4e12..9579409 100644
>>>>>> --- a/arch/x86/kvm/vmx.c
>>>>>> +++ b/arch/x86/kvm/vmx.c
>>>>>> @@ -2204,7 +2204,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>>>>>  #ifdef CONFIG_X86_64
>>>>>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>>>>>  #endif
>>>>>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>>>>>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>>>>>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>>>>>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>>>>>                                     VM_EXIT_LOAD_IA32_EFER);
>>>>>
>>>>> In the absence of VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, you need to hide
>>>>> PIN_BASED_VMX_PREEMPTION_TIMER from the guest as we cannot emulate its
>>>>> behavior properly in that case.
>> Besides, we need to test that in the absence of
>> PIN_BASED_VMX_PREEMPTION_TIMER, we need to hide
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though this should not happen
>> according to Intel SDM.
>
> If the SDM guarantees this for us, we don't need such a safety measure.
> Otherwise, it should be added, yes.
The SDM has the following description (see 26.2.1.2):

If the “activate VMX-preemption timer” VM-execution control is 0, the
“save VMX-preemption timer value” VM-exit control must also be 0.

It doesn't tell us whether these two flags are guaranteed to be
consistent when read from the related MSRs (IA32_VMX_PINBASED_CTLS and
IA32_VMX_EXIT_CTLS), so I think the check is needed here.
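
A sketch of where such a check could live when the nested control MSR
values are assembled (the exact placement is an assumption):

	/* Never advertise "save VMX-preemption timer value" to L1 if the
	 * pin-based preemption timer itself is not exposed. */
	if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER))
		nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;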

Arthur
>
> Jan
>
>
Abel Gordon Aug. 25, 2013, 7:50 a.m. UTC | #10
kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:

> From: Jan Kiszka <jan.kiszka@web.de>
> To: Abel Gordon/Haifa/IBM@IBMIL,
> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
> Date: 25/08/2013 10:43 AM
> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
timer
> Sent by: kvm-owner@vger.kernel.org
>
> On 2013-08-25 09:37, Abel Gordon wrote:
> >
> >
> >> From: Jan Kiszka <jan.kiszka@web.de>
> >> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
> >> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
> >> Date: 25/08/2013 09:44 AM
> >> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> > timer
> >> Sent by: kvm-owner@vger.kernel.org
> >>
> >> On 2013-08-24 20:44, root wrote:
> >>> This patch contains the following two changes:
> >>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> >>> with some reasons not emulated by L1, preemption timer value should
> >>> be save in such exits.
> >>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> >>> to nVMX.
> >>>
> >>> With this patch, nested VMX preemption timer features are fully
> >>> supported.
> >>>
> >>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> >>> ---
> >
> >>>
> >>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
> >> *vcpu, struct vmcs12 *vmcs12)
> >>>        (vmcs_config.pin_based_exec_ctrl |
> >>>         vmcs12->pin_based_vm_exec_control));
> >>>
> >>> -   if (vmcs12->pin_based_vm_exec_control &
> > PIN_BASED_VMX_PREEMPTION_TIMER)
> >>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >>> -              vmcs12->vmx_preemption_timer_value);
> >>> +   if (vmcs12->pin_based_vm_exec_control &
> >> PIN_BASED_VMX_PREEMPTION_TIMER) {
> >>> +      if (vmcs12->vm_exit_controls &
> > VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> >>> +         vmcs12->vmx_preemption_timer_value =
> >>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >>> +      else
> >>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >>> +               vmcs12->vmx_preemption_timer_value);
> >>> +   }
> >>
> >> This is not correct. We still need to set the vmcs to
> >> vmx_preemption_timer_value. The difference is that, on exit from L2,
> >> vmx_preemption_timer_value has to be updated according to the saved
> >> hardware state. The corresponding code is missing in your patch so
far.
> >
> > I think something else maybe be missing here: assuming L0 handles exits
> > for L2 without involving L1 (e.g. external interrupts or ept
violations),
> > then, we may spend some cycles in L0 handling these exits. Note L1 is
not
> > aware of these exits and from L1 perspective L2 was running on the CPU.
> > That means that we may need to reduce these cycles spent at
> > L0 from the preemtion timer or emulate a preemption timer exit to
> > force a transition to L1 instead of resuming L2.
>
> That's precisely what the logic I described should achieve: reload the
> value we saved on L2 exit on reentry.

But don't you think we should also subtract the cycles spent at L0 from
the preemption timer? I mean, if we spent X cycles at L0 handling an L2
exit that was not forwarded to L1, then, before we resume L2, the
preemption timer should be (previous_value_on_exit - X). If
(previous_value_on_exit - X) < 0, then we should force ("emulate") a
preemption timer exit from L2 to L1.
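
Roughly, as a sketch: tsc_at_l2_exit, preemption_timer_rate and
emulate_preemption_timer_exit() are all hypothetical names for
bookkeeping we would have to add, and the timer counts down once per
2^X TSC ticks, where X is MSR_IA32_VMX_MISC[4:0]:

	/* Hypothetical illustration, not existing KVM code. */
	u64 tsc_in_l0 = native_read_tsc() - tsc_at_l2_exit;	/* X TSC ticks in L0 */
	u32 elapsed = tsc_in_l0 >> preemption_timer_rate;	/* in timer ticks */

	if (vmcs12->vmx_preemption_timer_value > elapsed)
		vmcs12->vmx_preemption_timer_value -= elapsed;
	else
		/* the timer would have expired while we were in L0:
		 * emulate a preemption timer exit to L1 instead of resuming L2 */
		emulate_preemption_timer_exit(vcpu);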


Jan Kiszka Aug. 25, 2013, 7:54 a.m. UTC | #11
On 2013-08-25 09:50, Abel Gordon wrote:
> 
> 
> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
> 
>> From: Jan Kiszka <jan.kiszka@web.de>
>> To: Abel Gordon/Haifa/IBM@IBMIL,
>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>> Date: 25/08/2013 10:43 AM
>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> timer
>> Sent by: kvm-owner@vger.kernel.org
>>
>> On 2013-08-25 09:37, Abel Gordon wrote:
>>>
>>>
>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>>>> Date: 25/08/2013 09:44 AM
>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>> timer
>>>> Sent by: kvm-owner@vger.kernel.org
>>>>
>>>> On 2013-08-24 20:44, root wrote:
>>>>> This patch contains the following two changes:
>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>> be save in such exits.
>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>>>>> to nVMX.
>>>>>
>>>>> With this patch, nested VMX preemption timer features are fully
>>>>> supported.
>>>>>
>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>> ---
>>>
>>>>>
>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>>>> *vcpu, struct vmcs12 *vmcs12)
>>>>>        (vmcs_config.pin_based_exec_ctrl |
>>>>>         vmcs12->pin_based_vm_exec_control));
>>>>>
>>>>> -   if (vmcs12->pin_based_vm_exec_control &
>>> PIN_BASED_VMX_PREEMPTION_TIMER)
>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>> -              vmcs12->vmx_preemption_timer_value);
>>>>> +   if (vmcs12->pin_based_vm_exec_control &
>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>>> +      if (vmcs12->vm_exit_controls &
>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>>> +         vmcs12->vmx_preemption_timer_value =
>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>>> +      else
>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>> +               vmcs12->vmx_preemption_timer_value);
>>>>> +   }
>>>>
>>>> This is not correct. We still need to set the vmcs to
>>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>>>> vmx_preemption_timer_value has to be updated according to the saved
>>>> hardware state. The corresponding code is missing in your patch so
> far.
>>>
>>> I think something else maybe be missing here: assuming L0 handles exits
>>> for L2 without involving L1 (e.g. external interrupts or ept
> violations),
>>> then, we may spend some cycles in L0 handling these exits. Note L1 is
> not
>>> aware of these exits and from L1 perspective L2 was running on the CPU.
>>> That means that we may need to reduce these cycles spent at
>>> L0 from the preemtion timer or emulate a preemption timer exit to
>>> force a transition to L1 instead of resuming L2.
>>
>> That's precisely what the logic I described should achieve: reload the
>> value we saved on L2 exit on reentry.
> 
> But don't you think we should also reduce the cycles spent at L0 from the
> preemption timer ? I mean, if we spent X cycles at L0 handling a L2 exit
> which was not forwarded to L1, then, before we resume L2,
> the preemption timer should be: (previous_value_on_exit - X).
> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
> preemption timer exit between L2 and L1.

We ask the hardware to save the value of the preemption timer on L2
exit. This value will be exposed to L1 (if it asked for saving as well)
and/or be written back to the hardware on L2 reentry (unless L1 had a
chance to run and modified it). So the time spent in L0 is implicitly
subtracted.

Jan

PS: You had kvm-owner in CC.
Arthur Chunqi Li Aug. 25, 2013, 7:55 a.m. UTC | #12
On Sun, Aug 25, 2013 at 3:50 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
>
>
> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>
>> From: Jan Kiszka <jan.kiszka@web.de>
>> To: Abel Gordon/Haifa/IBM@IBMIL,
>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>> Date: 25/08/2013 10:43 AM
>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> timer
>> Sent by: kvm-owner@vger.kernel.org
>>
>> On 2013-08-25 09:37, Abel Gordon wrote:
>> >
>> >
>> >> From: Jan Kiszka <jan.kiszka@web.de>
>> >> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>> >> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>> >> Date: 25/08/2013 09:44 AM
>> >> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>> > timer
>> >> Sent by: kvm-owner@vger.kernel.org
>> >>
>> >> On 2013-08-24 20:44, root wrote:
>> >>> This patch contains the following two changes:
>> >>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> >>> with some reasons not emulated by L1, preemption timer value should
>> >>> be save in such exits.
>> >>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>> >>> to nVMX.
>> >>>
>> >>> With this patch, nested VMX preemption timer features are fully
>> >>> supported.
>> >>>
>> >>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> >>> ---
>> >
>> >>>
>> >>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>> >> *vcpu, struct vmcs12 *vmcs12)
>> >>>        (vmcs_config.pin_based_exec_ctrl |
>> >>>         vmcs12->pin_based_vm_exec_control));
>> >>>
>> >>> -   if (vmcs12->pin_based_vm_exec_control &
>> > PIN_BASED_VMX_PREEMPTION_TIMER)
>> >>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> >>> -              vmcs12->vmx_preemption_timer_value);
>> >>> +   if (vmcs12->pin_based_vm_exec_control &
>> >> PIN_BASED_VMX_PREEMPTION_TIMER) {
>> >>> +      if (vmcs12->vm_exit_controls &
>> > VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> >>> +         vmcs12->vmx_preemption_timer_value =
>> >>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> >>> +      else
>> >>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> >>> +               vmcs12->vmx_preemption_timer_value);
>> >>> +   }
>> >>
>> >> This is not correct. We still need to set the vmcs to
>> >> vmx_preemption_timer_value. The difference is that, on exit from L2,
>> >> vmx_preemption_timer_value has to be updated according to the saved
>> >> hardware state. The corresponding code is missing in your patch so
> far.
>> >
>> > I think something else maybe be missing here: assuming L0 handles exits
>> > for L2 without involving L1 (e.g. external interrupts or ept
> violations),
>> > then, we may spend some cycles in L0 handling these exits. Note L1 is
> not
>> > aware of these exits and from L1 perspective L2 was running on the CPU.
>> > That means that we may need to reduce these cycles spent at
>> > L0 from the preemtion timer or emulate a preemption timer exit to
>> > force a transition to L1 instead of resuming L2.
>>
>> That's precisely what the logic I described should achieve: reload the
>> value we saved on L2 exit on reentry.
>
> But don't you think we should also reduce the cycles spent at L0 from the
> preemption timer ? I mean, if we spent X cycles at L0 handling a L2 exit
> which was not forwarded to L1, then, before we resume L2,
> the preemption timer should be: (previous_value_on_exit - X).
> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
> preemption timer exit between L2 and L1.
Sorry, I previously misunderstood your comments. But why should we
exclude cycles spent in L0 from L2's preemption timer value? These
cycles are not spent by L2, so they should not be charged to L2.

Arthur
>
>
Abel Gordon Aug. 25, 2013, 8:04 a.m. UTC | #13
kvm-owner@vger.kernel.org wrote on 25/08/2013 10:55:24 AM:

> From: Arthur Chunqi Li <yzt356@gmail.com>
> To: Abel Gordon/Haifa/IBM@IBMIL,
> Cc: Jan Kiszka <jan.kiszka@web.de>, Gleb Natapov <gleb@redhat.com>,
> kvm <kvm@vger.kernel.org>, kvm-owner@vger.kernel.org, Paolo Bonzini
> <pbonzini@redhat.com>
> Date: 25/08/2013 10:55 AM
> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
timer
> Sent by: kvm-owner@vger.kernel.org
>
> On Sun, Aug 25, 2013 at 3:50 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
> >
> >
> > kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
> >
> >> From: Jan Kiszka <jan.kiszka@web.de>
> >> To: Abel Gordon/Haifa/IBM@IBMIL,
> >> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
> >> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
> >> Date: 25/08/2013 10:43 AM
> >> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> > timer
> >> Sent by: kvm-owner@vger.kernel.org
> >>
> >> On 2013-08-25 09:37, Abel Gordon wrote:
> >> >
> >> >
> >> >> From: Jan Kiszka <jan.kiszka@web.de>
> >> >> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
> >> >> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
> >> >> Date: 25/08/2013 09:44 AM
> >> >> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
preemption
> >> > timer
> >> >> Sent by: kvm-owner@vger.kernel.org
> >> >>
> >> >> On 2013-08-24 20:44, root wrote:
> >> >>> This patch contains the following two changes:
> >> >>> 1. Fix the bug in nested preemption timer support. If vmexit L2->
L0
> >> >>> with some reasons not emulated by L1, preemption timer value
should
> >> >>> be save in such exits.
> >> >>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
controls
> >> >>> to nVMX.
> >> >>>
> >> >>> With this patch, nested VMX preemption timer features are fully
> >> >>> supported.
> >> >>>
> >> >>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> >> >>> ---
> >> >
> >> >>>
> >> >>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
> >> >> *vcpu, struct vmcs12 *vmcs12)
> >> >>>        (vmcs_config.pin_based_exec_ctrl |
> >> >>>         vmcs12->pin_based_vm_exec_control));
> >> >>>
> >> >>> -   if (vmcs12->pin_based_vm_exec_control &
> >> > PIN_BASED_VMX_PREEMPTION_TIMER)
> >> >>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >> >>> -              vmcs12->vmx_preemption_timer_value);
> >> >>> +   if (vmcs12->pin_based_vm_exec_control &
> >> >> PIN_BASED_VMX_PREEMPTION_TIMER) {
> >> >>> +      if (vmcs12->vm_exit_controls &
> >> > VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> >> >>> +         vmcs12->vmx_preemption_timer_value =
> >> >>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >> >>> +      else
> >> >>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >> >>> +               vmcs12->vmx_preemption_timer_value);
> >> >>> +   }
> >> >>
> >> >> This is not correct. We still need to set the vmcs to
> >> >> vmx_preemption_timer_value. The difference is that, on exit from
L2,
> >> >> vmx_preemption_timer_value has to be updated according to the saved
> >> >> hardware state. The corresponding code is missing in your patch so
> > far.
> >> >
> >> > I think something else maybe be missing here: assuming L0 handles
exits
> >> > for L2 without involving L1 (e.g. external interrupts or ept
> > violations),
> >> > then, we may spend some cycles in L0 handling these exits. Note L1
is
> > not
> >> > aware of these exits and from L1 perspective L2 was running on the
CPU.
> >> > That means that we may need to reduce these cycles spent at
> >> > L0 from the preemtion timer or emulate a preemption timer exit to
> >> > force a transition to L1 instead of resuming L2.
> >>
> >> That's precisely what the logic I described should achieve: reload the
> >> value we saved on L2 exit on reentry.
> >
> > But don't you think we should also reduce the cycles spent at L0 from
the
> > preemption timer ? I mean, if we spent X cycles at L0 handling a L2
exit
> > which was not forwarded to L1, then, before we resume L2,
> > the preemption timer should be: (previous_value_on_exit - X).
> > If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
> > preemption timer exit between L2 and L1.
> Sorry, I previously misunderstand your comments. But why should we
> need to exclude cycles in L0 from L2 preemption value? These cycles
> are not spent by L2 and it should not be on L2.

L1 asked the "hardware" (emulated by L0) to run L2 and force an exit
after "Y" cycles. Now, in practice, we may spend "X" cycles at L0
handling exits without switching to L1. That means that from L1's
perspective L2 was running during all these X cycles. L1 should assume
that the instructions per cycle the CPU executed decreased, but the
cycles were still spent. That's why I believe you should take these X
cycles into account.

Jan Kiszka Aug. 25, 2013, 8:13 a.m. UTC | #14
On 2013-08-25 10:04, Abel Gordon wrote:
> 
> 
> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:55:24 AM:
> 
>> From: Arthur Chunqi Li <yzt356@gmail.com>
>> To: Abel Gordon/Haifa/IBM@IBMIL,
>> Cc: Jan Kiszka <jan.kiszka@web.de>, Gleb Natapov <gleb@redhat.com>,
>> kvm <kvm@vger.kernel.org>, kvm-owner@vger.kernel.org, Paolo Bonzini
>> <pbonzini@redhat.com>
>> Date: 25/08/2013 10:55 AM
>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> timer
>> Sent by: kvm-owner@vger.kernel.org
>>
>> On Sun, Aug 25, 2013 at 3:50 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
>>>
>>>
>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>>>
>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>>>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>>>> Date: 25/08/2013 10:43 AM
>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>> timer
>>>> Sent by: kvm-owner@vger.kernel.org
>>>>
>>>> On 2013-08-25 09:37, Abel Gordon wrote:
>>>>>
>>>>>
>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>>>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>>>>>> Date: 25/08/2013 09:44 AM
>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
> preemption
>>>>> timer
>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>
>>>>>> On 2013-08-24 20:44, root wrote:
>>>>>>> This patch contains the following two changes:
>>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->
> L0
>>>>>>> with some reasons not emulated by L1, preemption timer value
> should
>>>>>>> be save in such exits.
>>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
> controls
>>>>>>> to nVMX.
>>>>>>>
>>>>>>> With this patch, nested VMX preemption timer features are fully
>>>>>>> supported.
>>>>>>>
>>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>>>> ---
>>>>>
>>>>>>>
>>>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>>>>>> *vcpu, struct vmcs12 *vmcs12)
>>>>>>>        (vmcs_config.pin_based_exec_ctrl |
>>>>>>>         vmcs12->pin_based_vm_exec_control));
>>>>>>>
>>>>>>> -   if (vmcs12->pin_based_vm_exec_control &
>>>>> PIN_BASED_VMX_PREEMPTION_TIMER)
>>>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>> -              vmcs12->vmx_preemption_timer_value);
>>>>>>> +   if (vmcs12->pin_based_vm_exec_control &
>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>>>>> +      if (vmcs12->vm_exit_controls &
>>>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>>>>> +         vmcs12->vmx_preemption_timer_value =
>>>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>>>>> +      else
>>>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>> +               vmcs12->vmx_preemption_timer_value);
>>>>>>> +   }
>>>>>>
>>>>>> This is not correct. We still need to set the vmcs to
>>>>>> vmx_preemption_timer_value. The difference is that, on exit from
> L2,
>>>>>> vmx_preemption_timer_value has to be updated according to the saved
>>>>>> hardware state. The corresponding code is missing in your patch so
>>> far.
>>>>>
>>>>> I think something else maybe be missing here: assuming L0 handles
> exits
>>>>> for L2 without involving L1 (e.g. external interrupts or ept
>>> violations),
>>>>> then, we may spend some cycles in L0 handling these exits. Note L1
> is
>>> not
>>>>> aware of these exits and from L1 perspective L2 was running on the
> CPU.
>>>>> That means that we may need to reduce these cycles spent at
>>>>> L0 from the preemtion timer or emulate a preemption timer exit to
>>>>> force a transition to L1 instead of resuming L2.
>>>>
>>>> That's precisely what the logic I described should achieve: reload the
>>>> value we saved on L2 exit on reentry.
>>>
>>> But don't you think we should also reduce the cycles spent at L0 from
> the
>>> preemption timer ? I mean, if we spent X cycles at L0 handling a L2
> exit
>>> which was not forwarded to L1, then, before we resume L2,
>>> the preemption timer should be: (previous_value_on_exit - X).
>>> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
>>> preemption timer exit between L2 and L1.
>> Sorry, I previously misunderstand your comments. But why should we
>> need to exclude cycles in L0 from L2 preemption value? These cycles
>> are not spent by L2 and it should not be on L2.
> 
> L1 asked the "hardware" (emulated by L0) to run L2 and force an exit
> after "Y" cycles. Now, in practice, we may spend "X" cycles at L0 handling
> exits without switching to L1. That means that from L1 perspective L2
> was running all these X cycles. L1 should assume that the instructions per
> cycle
> the CPU executed decreased but the cycles were spent. That's why I believe
> you should take in account these X cycles.
> 

Now I get it. There is likely some truth in this, as the reference clock
for the preemption timer, the TSC, isn't stopped for L1/L2 while running
in L0, and the SDM demands that the countdown be proportional to that clock.

Jan
Abel Gordon Aug. 25, 2013, 8:18 a.m. UTC | #15
kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:

> From: Jan Kiszka <jan.kiszka@web.de>
> To: Abel Gordon/Haifa/IBM@IBMIL,
> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
> Date: 25/08/2013 10:54 AM
> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
timer
> Sent by: kvm-owner@vger.kernel.org
>
> On 2013-08-25 09:50, Abel Gordon wrote:
> >
> >
> > kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
> >
> >> From: Jan Kiszka <jan.kiszka@web.de>
> >> To: Abel Gordon/Haifa/IBM@IBMIL,
> >> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
> >> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
> >> Date: 25/08/2013 10:43 AM
> >> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> > timer
> >> Sent by: kvm-owner@vger.kernel.org
> >>
> >> On 2013-08-25 09:37, Abel Gordon wrote:
> >>>
> >>>
> >>>> From: Jan Kiszka <jan.kiszka@web.de>
> >>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
> >>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
> >>>> Date: 25/08/2013 09:44 AM
> >>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
preemption
> >>> timer
> >>>> Sent by: kvm-owner@vger.kernel.org
> >>>>
> >>>> On 2013-08-24 20:44, root wrote:
> >>>>> This patch contains the following two changes:
> >>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> >>>>> with some reasons not emulated by L1, preemption timer value should
> >>>>> be save in such exits.
> >>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
controls
> >>>>> to nVMX.
> >>>>>
> >>>>> With this patch, nested VMX preemption timer features are fully
> >>>>> supported.
> >>>>>
> >>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> >>>>> ---
> >>>
> >>>>>
> >>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
> >>>> *vcpu, struct vmcs12 *vmcs12)
> >>>>>        (vmcs_config.pin_based_exec_ctrl |
> >>>>>         vmcs12->pin_based_vm_exec_control));
> >>>>>
> >>>>> -   if (vmcs12->pin_based_vm_exec_control &
> >>> PIN_BASED_VMX_PREEMPTION_TIMER)
> >>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >>>>> -              vmcs12->vmx_preemption_timer_value);
> >>>>> +   if (vmcs12->pin_based_vm_exec_control &
> >>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
> >>>>> +      if (vmcs12->vm_exit_controls &
> >>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> >>>>> +         vmcs12->vmx_preemption_timer_value =
> >>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >>>>> +      else
> >>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >>>>> +               vmcs12->vmx_preemption_timer_value);
> >>>>> +   }
> >>>>
> >>>> This is not correct. We still need to set the vmcs to
> >>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
> >>>> vmx_preemption_timer_value has to be updated according to the saved
> >>>> hardware state. The corresponding code is missing in your patch so
> > far.
> >>>
> >>> I think something else maybe be missing here: assuming L0 handles
exits
> >>> for L2 without involving L1 (e.g. external interrupts or ept
> > violations),
> >>> then, we may spend some cycles in L0 handling these exits. Note L1 is
> > not
> >>> aware of these exits and from L1 perspective L2 was running on the
CPU.
> >>> That means that we may need to reduce these cycles spent at
> >>> L0 from the preemtion timer or emulate a preemption timer exit to
> >>> force a transition to L1 instead of resuming L2.
> >>
> >> That's precisely what the logic I described should achieve: reload the
> >> value we saved on L2 exit on reentry.
> >
> > But don't you think we should also reduce the cycles spent at L0 from
the
> > preemption timer ? I mean, if we spent X cycles at L0 handling a L2
exit
> > which was not forwarded to L1, then, before we resume L2,
> > the preemption timer should be: (previous_value_on_exit - X).
> > If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
> > preemption timer exit between L2 and L1.
>
> We ask the hardware to save the value of the preemption on L2 exit. This
> value will be exposed to L1 (if it asked for saving as well) and/or be
> written back to the hardware on L2 reenty (unless L1 had a chance to run
> and modified it). So the time spent in L0 is implicitly subtracted.

I think you are suggesting the following, please correct me if I am wrong.
1) L1 resumes L2 with the preemption timer enabled
2) L0 emulates the resume/launch
3) L2 runs for Y cycles until an external interrupt occurs (Y < preemption timer specified by L1)
4) L0 saves the preemption timer (original value - Y)
5) L0 spends X cycles handling the external interrupt
6) L0 resumes L2 with preemption timer = original value - Y

Note that in this case "X is ignored".

I was suggesting to do the following:
6) If original value - Y - X > 0 then
 L0 resumes L2 with preemption timer = original value - Y - X
else
 L0 emulates a L2->L1 preemption timer exit (resumes L1)
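
Put as a rough C sketch (not part of the patch): "saved" stands for the value the hardware stored on the L2 exit, i.e. original value - Y, "l0_ticks" for X already scaled to preemption-timer units, and nested_vmx_vmexit() stands in for whatever path reflects the exit into L1.

static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu,
					   u32 saved, u32 l0_ticks)
{
	if (saved > l0_ticks)
		/* resume L2 with the remaining budget */
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, saved - l0_ticks);
	else
		/* budget exhausted while in L0: emulate the L2->L1
		 * preemption timer exit instead of resuming L2 */
		nested_vmx_vmexit(vcpu);
}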





Jan Kiszka Aug. 25, 2013, 8:25 a.m. UTC | #16
On 2013-08-25 10:18, Abel Gordon wrote:
> 
> 
> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:
> 
>> From: Jan Kiszka <jan.kiszka@web.de>
>> To: Abel Gordon/Haifa/IBM@IBMIL,
>> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
>> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
>> Date: 25/08/2013 10:54 AM
>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> timer
>> Sent by: kvm-owner@vger.kernel.org
>>
>> On 2013-08-25 09:50, Abel Gordon wrote:
>>>
>>>
>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>>>
>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>>>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>>>> Date: 25/08/2013 10:43 AM
>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>> timer
>>>> Sent by: kvm-owner@vger.kernel.org
>>>>
>>>> On 2013-08-25 09:37, Abel Gordon wrote:
>>>>>
>>>>>
>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>>>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>>>>>> Date: 25/08/2013 09:44 AM
>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
> preemption
>>>>> timer
>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>
>>>>>> On 2013-08-24 20:44, root wrote:
>>>>>>> This patch contains the following two changes:
>>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>>>> be save in such exits.
>>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
> controls
>>>>>>> to nVMX.
>>>>>>>
>>>>>>> With this patch, nested VMX preemption timer features are fully
>>>>>>> supported.
>>>>>>>
>>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>>>> ---
>>>>>
>>>>>>>
>>>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>>>>>> *vcpu, struct vmcs12 *vmcs12)
>>>>>>>        (vmcs_config.pin_based_exec_ctrl |
>>>>>>>         vmcs12->pin_based_vm_exec_control));
>>>>>>>
>>>>>>> -   if (vmcs12->pin_based_vm_exec_control &
>>>>> PIN_BASED_VMX_PREEMPTION_TIMER)
>>>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>> -              vmcs12->vmx_preemption_timer_value);
>>>>>>> +   if (vmcs12->pin_based_vm_exec_control &
>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>>>>> +      if (vmcs12->vm_exit_controls &
>>>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>>>>> +         vmcs12->vmx_preemption_timer_value =
>>>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>>>>> +      else
>>>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>> +               vmcs12->vmx_preemption_timer_value);
>>>>>>> +   }
>>>>>>
>>>>>> This is not correct. We still need to set the vmcs to
>>>>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>>>>>> vmx_preemption_timer_value has to be updated according to the saved
>>>>>> hardware state. The corresponding code is missing in your patch so
>>> far.
>>>>>
>>>>> I think something else maybe be missing here: assuming L0 handles
> exits
>>>>> for L2 without involving L1 (e.g. external interrupts or ept
>>> violations),
>>>>> then, we may spend some cycles in L0 handling these exits. Note L1 is
>>> not
>>>>> aware of these exits and from L1 perspective L2 was running on the
> CPU.
>>>>> That means that we may need to reduce these cycles spent at
>>>>> L0 from the preemtion timer or emulate a preemption timer exit to
>>>>> force a transition to L1 instead of resuming L2.
>>>>
>>>> That's precisely what the logic I described should achieve: reload the
>>>> value we saved on L2 exit on reentry.
>>>
>>> But don't you think we should also reduce the cycles spent at L0 from
> the
>>> preemption timer ? I mean, if we spent X cycles at L0 handling a L2
> exit
>>> which was not forwarded to L1, then, before we resume L2,
>>> the preemption timer should be: (previous_value_on_exit - X).
>>> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
>>> preemption timer exit between L2 and L1.
>>
>> We ask the hardware to save the value of the preemption on L2 exit. This
>> value will be exposed to L1 (if it asked for saving as well) and/or be
>> written back to the hardware on L2 reenty (unless L1 had a chance to run
>> and modified it). So the time spent in L0 is implicitly subtracted.
> 
> I think you are suggesting the following, please correct me if I am wrong.
> 1) L1 resumes L2 with preemption timer enabled
> 2) L0 emulates the resume/launch
> 3) L2 runs for Y cycles until an external interrupt occurs (Y < preemption
> timer specified by L1)
> 4) L0 saved the preemption timer (original value - Y)
> 5) L0 spends X cycles handling the external interrupt
> 6) L0 resumes L2 with preemption timer = original value - Y
> 
> Note that in this case "X is ignored".

Yes, but see my other reply.

> 
> I was suggesting to do the following:
> 6) If original value - Y - X > 0 then
>  L0 resumes L2 with preemption timer = original value - Y - X
> else
>  L0 emulates a L2->L1 preemption timer exit (resumes L1)

Almost. 6) should be:
If an exit to L1 occurred after the last L2 run, set X to 0. Then load
MAX(original value - Y - X, 0).

The hardware will trigger the exit for us.

Jan
Jan Kiszka Aug. 25, 2013, 8:27 a.m. UTC | #17
On 2013-08-25 10:25, Jan Kiszka wrote:
> On 2013-08-25 10:18, Abel Gordon wrote:
>>
>>
>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:
>>
>>> From: Jan Kiszka <jan.kiszka@web.de>
>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
>>> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
>>> Date: 25/08/2013 10:54 AM
>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>> timer
>>> Sent by: kvm-owner@vger.kernel.org
>>>
>>> On 2013-08-25 09:50, Abel Gordon wrote:
>>>>
>>>>
>>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>>>>
>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>>>>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>>>>> Date: 25/08/2013 10:43 AM
>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>>> timer
>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>
>>>>> On 2013-08-25 09:37, Abel Gordon wrote:
>>>>>>
>>>>>>
>>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>>>>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>>>>>>> Date: 25/08/2013 09:44 AM
>>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
>> preemption
>>>>>> timer
>>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>>
>>>>>>> On 2013-08-24 20:44, root wrote:
>>>>>>>> This patch contains the following two changes:
>>>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>>>>> be save in such exits.
>>>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
>> controls
>>>>>>>> to nVMX.
>>>>>>>>
>>>>>>>> With this patch, nested VMX preemption timer features are fully
>>>>>>>> supported.
>>>>>>>>
>>>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>>>>> ---
>>>>>>
>>>>>>>>
>>>>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>>>>>>> *vcpu, struct vmcs12 *vmcs12)
>>>>>>>>        (vmcs_config.pin_based_exec_ctrl |
>>>>>>>>         vmcs12->pin_based_vm_exec_control));
>>>>>>>>
>>>>>>>> -   if (vmcs12->pin_based_vm_exec_control &
>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER)
>>>>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>> -              vmcs12->vmx_preemption_timer_value);
>>>>>>>> +   if (vmcs12->pin_based_vm_exec_control &
>>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>>>>>> +      if (vmcs12->vm_exit_controls &
>>>>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>>>>>> +         vmcs12->vmx_preemption_timer_value =
>>>>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>>>>>> +      else
>>>>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>> +               vmcs12->vmx_preemption_timer_value);
>>>>>>>> +   }
>>>>>>>
>>>>>>> This is not correct. We still need to set the vmcs to
>>>>>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>>>>>>> vmx_preemption_timer_value has to be updated according to the saved
>>>>>>> hardware state. The corresponding code is missing in your patch so
>>>> far.
>>>>>>
>>>>>> I think something else maybe be missing here: assuming L0 handles
>> exits
>>>>>> for L2 without involving L1 (e.g. external interrupts or ept
>>>> violations),
>>>>>> then, we may spend some cycles in L0 handling these exits. Note L1 is
>>>> not
>>>>>> aware of these exits and from L1 perspective L2 was running on the
>> CPU.
>>>>>> That means that we may need to reduce these cycles spent at
>>>>>> L0 from the preemtion timer or emulate a preemption timer exit to
>>>>>> force a transition to L1 instead of resuming L2.
>>>>>
>>>>> That's precisely what the logic I described should achieve: reload the
>>>>> value we saved on L2 exit on reentry.
>>>>
>>>> But don't you think we should also reduce the cycles spent at L0 from
>> the
>>>> preemption timer ? I mean, if we spent X cycles at L0 handling a L2
>> exit
>>>> which was not forwarded to L1, then, before we resume L2,
>>>> the preemption timer should be: (previous_value_on_exit - X).
>>>> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
>>>> preemption timer exit between L2 and L1.
>>>
>>> We ask the hardware to save the value of the preemption on L2 exit. This
>>> value will be exposed to L1 (if it asked for saving as well) and/or be
>>> written back to the hardware on L2 reenty (unless L1 had a chance to run
>>> and modified it). So the time spent in L0 is implicitly subtracted.
>>
>> I think you are suggesting the following, please correct me if I am wrong.
>> 1) L1 resumes L2 with preemption timer enabled
>> 2) L0 emulates the resume/launch
>> 3) L2 runs for Y cycles until an external interrupt occurs (Y < preemption
>> timer specified by L1)
>> 4) L0 saved the preemption timer (original value - Y)
>> 5) L0 spends X cycles handling the external interrupt
>> 6) L0 resumes L2 with preemption timer = original value - Y
>>
>> Note that in this case "X is ignored".
> 
> Yes, but see my other reply.
> 
>>
>> I was suggesting to do the following:
>> 6) If original value - Y - X > 0 then
>>  L0 resumes L2 with preemption timer = original value - Y - X
>> else
>>  L0 emulates a L2->L1 preemption timer exit (resumes L1)
> 
> Almost . 6) should be:
> If exit to L1 occurred after last L2, set X to 0. Then load MAX(original
> value - Y - X, 0).

Hmm, no:

If an exit to L1 occurred after the last L2 run, load the value from
vmcs12, else load MAX(original value - Y - X, 0).

Jan
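
Written out as a sketch of that rule on L2 reentry (the flag l1_ran_since_last_l2_exit and the variables saved and l0_ticks are hypothetical bookkeeping, not the submitted patch):

	u32 timer;

	if (l1_ran_since_last_l2_exit)
		/* L1 saw the saved value and may have rewritten it */
		timer = vmcs12->vmx_preemption_timer_value;
	else
		/* charge the time spent in L0; a value of 0 makes the
		 * hardware exit right after VM entry, so no separate
		 * emulated exit is needed */
		timer = saved > l0_ticks ? saved - l0_ticks : 0;

	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, timer);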
Abel Gordon Aug. 25, 2013, 8:39 a.m. UTC | #18
Jan Kiszka <jan.kiszka@web.de> wrote on 25/08/2013 11:27:22 AM:

> From: Jan Kiszka <jan.kiszka@web.de>
> To: Abel Gordon/Haifa/IBM@IBMIL,
> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
> "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
> Date: 25/08/2013 11:27 AM
> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
timer
>
> On 2013-08-25 10:25, Jan Kiszka wrote:
> > On 2013-08-25 10:18, Abel Gordon wrote:
> >>
> >>
> >> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:
> >>
> >>> From: Jan Kiszka <jan.kiszka@web.de>
> >>> To: Abel Gordon/Haifa/IBM@IBMIL,
> >>> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
> >>> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
> >>> Date: 25/08/2013 10:54 AM
> >>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
preemption
> >> timer
> >>> Sent by: kvm-owner@vger.kernel.org
> >>>
> >>> On 2013-08-25 09:50, Abel Gordon wrote:
> >>>>
> >>>>
> >>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
> >>>>
> >>>>> From: Jan Kiszka <jan.kiszka@web.de>
> >>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
> >>>>> Cc: gleb@redhat.com, kvm@vger.kernel.org,
kvm-owner@vger.kernel.org,
> >>>>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
> >>>>> Date: 25/08/2013 10:43 AM
> >>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
preemption
> >>>> timer
> >>>>> Sent by: kvm-owner@vger.kernel.org
> >>>>>
> >>>>> On 2013-08-25 09:37, Abel Gordon wrote:
> >>>>>>
> >>>>>>
> >>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
> >>>>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
> >>>>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
> >>>>>>> Date: 25/08/2013 09:44 AM
> >>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
> >> preemption
> >>>>>> timer
> >>>>>>> Sent by: kvm-owner@vger.kernel.org
> >>>>>>>
> >>>>>>> On 2013-08-24 20:44, root wrote:
> >>>>>>>> This patch contains the following two changes:
> >>>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit
L2->L0
> >>>>>>>> with some reasons not emulated by L1, preemption timer value
should
> >>>>>>>> be save in such exits.
> >>>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
> >> controls
> >>>>>>>> to nVMX.
> >>>>>>>>
> >>>>>>>> With this patch, nested VMX preemption timer features are fully
> >>>>>>>> supported.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> >>>>>>>> ---
> >>>>>>
> >>>>>>>>
> >>>>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct
kvm_vcpu
> >>>>>>> *vcpu, struct vmcs12 *vmcs12)
> >>>>>>>>        (vmcs_config.pin_based_exec_ctrl |
> >>>>>>>>         vmcs12->pin_based_vm_exec_control));
> >>>>>>>>
> >>>>>>>> -   if (vmcs12->pin_based_vm_exec_control &
> >>>>>> PIN_BASED_VMX_PREEMPTION_TIMER)
> >>>>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >>>>>>>> -              vmcs12->vmx_preemption_timer_value);
> >>>>>>>> +   if (vmcs12->pin_based_vm_exec_control &
> >>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
> >>>>>>>> +      if (vmcs12->vm_exit_controls &
> >>>>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> >>>>>>>> +         vmcs12->vmx_preemption_timer_value =
> >>>>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >>>>>>>> +      else
> >>>>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >>>>>>>> +               vmcs12->vmx_preemption_timer_value);
> >>>>>>>> +   }
> >>>>>>>
> >>>>>>> This is not correct. We still need to set the vmcs to
> >>>>>>> vmx_preemption_timer_value. The difference is that, on exit from
L2,
> >>>>>>> vmx_preemption_timer_value has to be updated according to the
saved
> >>>>>>> hardware state. The corresponding code is missing in your patch
so
> >>>> far.
> >>>>>>
> >>>>>> I think something else maybe be missing here: assuming L0 handles
> >> exits
> >>>>>> for L2 without involving L1 (e.g. external interrupts or ept
> >>>> violations),
> >>>>>> then, we may spend some cycles in L0 handling these exits. Note L1
is
> >>>> not
> >>>>>> aware of these exits and from L1 perspective L2 was running on the
> >> CPU.
> >>>>>> That means that we may need to reduce these cycles spent at
> >>>>>> L0 from the preemtion timer or emulate a preemption timer exit to
> >>>>>> force a transition to L1 instead of resuming L2.
> >>>>>
> >>>>> That's precisely what the logic I described should achieve: reload
the
> >>>>> value we saved on L2 exit on reentry.
> >>>>
> >>>> But don't you think we should also reduce the cycles spent at L0
from
> >> the
> >>>> preemption timer ? I mean, if we spent X cycles at L0 handling a L2
> >> exit
> >>>> which was not forwarded to L1, then, before we resume L2,
> >>>> the preemption timer should be: (previous_value_on_exit - X).
> >>>> If (previous_value_on_exit - X) < 0, then we should force
("emulate") a
> >>>> preemption timer exit between L2 and L1.
> >>>
> >>> We ask the hardware to save the value of the preemption on L2 exit.
This
> >>> value will be exposed to L1 (if it asked for saving as well) and/or
be
> >>> written back to the hardware on L2 reenty (unless L1 had a chance to
run
> >>> and modified it). So the time spent in L0 is implicitly subtracted.
> >>
> >> I think you are suggesting the following, please correct me if I am
wrong.
> >> 1) L1 resumes L2 with preemption timer enabled
> >> 2) L0 emulates the resume/launch
> >> 3) L2 runs for Y cycles until an external interrupt occurs (Y <
preemption
> >> timer specified by L1)
> >> 4) L0 saved the preemption timer (original value - Y)
> >> 5) L0 spends X cycles handling the external interrupt
> >> 6) L0 resumes L2 with preemption timer = original value - Y
> >>
> >> Note that in this case "X is ignored".
> >
> > Yes, but see my other reply.

I sent my reply before I read yours, sorry.
Anyway, we are now on the same page ;)


> >> I was suggesting to do the following:
> >> 6) If original value - Y - X > 0 then
> >>  L0 resumes L2 with preemption timer = original value - Y - X
> >> else
> >>  L0 emulates a L2->L1 preemption timer exit (resumes L1)
> >
> > Almost . 6) should be:
> > If exit to L1 occurred after last L2, set X to 0. Then load MAX
(original
> > value - Y - X, 0).
>
> Hmm, no:
>
> If exit to L1 occurred after last L2, load value of vmcs12, else load
> MAX(original
> value - Y - X, 0).

Note you are resuming L2 to force an immediate exit. I agree this approach
will be easier and cleaner to implement/maintain but it could force one
more exit and entry. Anyway, any approach is welcome as long as it
considers the cycles spent at L0 ("X") as we previously discussed and
agreed.

Regards,
Abel.

Arthur Chunqi Li Aug. 25, 2013, 8:41 a.m. UTC | #19
On Sun, Aug 25, 2013 at 4:18 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
>
>
> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:
>
>> From: Jan Kiszka <jan.kiszka@web.de>
>> To: Abel Gordon/Haifa/IBM@IBMIL,
>> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
>> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
>> Date: 25/08/2013 10:54 AM
>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
> timer
>> Sent by: kvm-owner@vger.kernel.org
>>
>> On 2013-08-25 09:50, Abel Gordon wrote:
>> >
>> >
>> > kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>> >
>> >> From: Jan Kiszka <jan.kiszka@web.de>
>> >> To: Abel Gordon/Haifa/IBM@IBMIL,
>> >> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>> >> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>> >> Date: 25/08/2013 10:43 AM
>> >> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>> > timer
>> >> Sent by: kvm-owner@vger.kernel.org
>> >>
>> >> On 2013-08-25 09:37, Abel Gordon wrote:
>> >>>
>> >>>
>> >>>> From: Jan Kiszka <jan.kiszka@web.de>
>> >>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>> >>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>> >>>> Date: 25/08/2013 09:44 AM
>> >>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
> preemption
>> >>> timer
>> >>>> Sent by: kvm-owner@vger.kernel.org
>> >>>>
>> >>>> On 2013-08-24 20:44, root wrote:
>> >>>>> This patch contains the following two changes:
>> >>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> >>>>> with some reasons not emulated by L1, preemption timer value should
>> >>>>> be save in such exits.
>> >>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
> controls
>> >>>>> to nVMX.
>> >>>>>
>> >>>>> With this patch, nested VMX preemption timer features are fully
>> >>>>> supported.
>> >>>>>
>> >>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> >>>>> ---
>> >>>
>> >>>>>
>> >>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>> >>>> *vcpu, struct vmcs12 *vmcs12)
>> >>>>>        (vmcs_config.pin_based_exec_ctrl |
>> >>>>>         vmcs12->pin_based_vm_exec_control));
>> >>>>>
>> >>>>> -   if (vmcs12->pin_based_vm_exec_control &
>> >>> PIN_BASED_VMX_PREEMPTION_TIMER)
>> >>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> >>>>> -              vmcs12->vmx_preemption_timer_value);
>> >>>>> +   if (vmcs12->pin_based_vm_exec_control &
>> >>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>> >>>>> +      if (vmcs12->vm_exit_controls &
>> >>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> >>>>> +         vmcs12->vmx_preemption_timer_value =
>> >>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> >>>>> +      else
>> >>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> >>>>> +               vmcs12->vmx_preemption_timer_value);
>> >>>>> +   }
>> >>>>
>> >>>> This is not correct. We still need to set the vmcs to
>> >>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>> >>>> vmx_preemption_timer_value has to be updated according to the saved
>> >>>> hardware state. The corresponding code is missing in your patch so
>> > far.
>> >>>
>> >>> I think something else maybe be missing here: assuming L0 handles
> exits
>> >>> for L2 without involving L1 (e.g. external interrupts or ept
>> > violations),
>> >>> then, we may spend some cycles in L0 handling these exits. Note L1 is
>> > not
>> >>> aware of these exits and from L1 perspective L2 was running on the
> CPU.
>> >>> That means that we may need to reduce these cycles spent at
>> >>> L0 from the preemtion timer or emulate a preemption timer exit to
>> >>> force a transition to L1 instead of resuming L2.
>> >>
>> >> That's precisely what the logic I described should achieve: reload the
>> >> value we saved on L2 exit on reentry.
>> >
>> > But don't you think we should also reduce the cycles spent at L0 from
> the
>> > preemption timer ? I mean, if we spent X cycles at L0 handling a L2
> exit
>> > which was not forwarded to L1, then, before we resume L2,
>> > the preemption timer should be: (previous_value_on_exit - X).
>> > If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
>> > preemption timer exit between L2 and L1.
>>
>> We ask the hardware to save the value of the preemption on L2 exit. This
>> value will be exposed to L1 (if it asked for saving as well) and/or be
>> written back to the hardware on L2 reenty (unless L1 had a chance to run
>> and modified it). So the time spent in L0 is implicitly subtracted.
>
> I think you are suggesting the following, please correct me if I am wrong.
> 1) L1 resumes L2 with preemption timer enabled
> 2) L0 emulates the resume/launch
> 3) L2 runs for Y cycles until an external interrupt occurs (Y < preemption
> timer specified by L1)
> 4) L0 saved the preemption timer (original value - Y)
> 5) L0 spends X cycles handling the external interrupt
> 6) L0 resumes L2 with preemption timer = original value - Y
>
> Note that in this case "X is ignored".
>
> I was suggesting to do the following:
> 6) If original value - Y - X > 0 then
>  L0 resumes L2 with preemption timer = original value - Y - X
> else
>  L0 emulates a L2->L1 preemption timer exit (resumes L1)
Yes, your description is right. But I'm still thinking about my
previous point: why should we count these X cycles as time spent by L2?
In nested VMX, the external interrupt is not provided by L1; it is
triggered by L0 and intends to cause a periodic exit to L1, so L2 is
only "accidentally injured". Since these interrupts are neither
generated by L1 nor intended to affect L2, these cycles should not be
treated as time L2 spent. Even though these cycles look "spent" from
L1's point of view, they should not be taken into account in nested VMX.

As another example, if the vcpu is scheduled out while L0 is handling
such an interrupt and the CPU does some other work before scheduling
this vcpu again, the cycles spent executing other processes should
definitely not be treated as time L2 spent.

Arthur
>
>
>
>
>
Jan Kiszka Aug. 25, 2013, 8:53 a.m. UTC | #20
On 2013-08-25 10:41, Arthur Chunqi Li wrote:
> On Sun, Aug 25, 2013 at 4:18 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
>>
>>
>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:
>>
>>> From: Jan Kiszka <jan.kiszka@web.de>
>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
>>> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
>>> Date: 25/08/2013 10:54 AM
>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>> timer
>>> Sent by: kvm-owner@vger.kernel.org
>>>
>>> On 2013-08-25 09:50, Abel Gordon wrote:
>>>>
>>>>
>>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>>>>
>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>>>>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>>>>> Date: 25/08/2013 10:43 AM
>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>>> timer
>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>
>>>>> On 2013-08-25 09:37, Abel Gordon wrote:
>>>>>>
>>>>>>
>>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>>>>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>>>>>>> Date: 25/08/2013 09:44 AM
>>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
>> preemption
>>>>>> timer
>>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>>
>>>>>>> On 2013-08-24 20:44, root wrote:
>>>>>>>> This patch contains the following two changes:
>>>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>>>>> be save in such exits.
>>>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
>> controls
>>>>>>>> to nVMX.
>>>>>>>>
>>>>>>>> With this patch, nested VMX preemption timer features are fully
>>>>>>>> supported.
>>>>>>>>
>>>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>>>>> ---
>>>>>>
>>>>>>>>
>>>>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>>>>>>> *vcpu, struct vmcs12 *vmcs12)
>>>>>>>>        (vmcs_config.pin_based_exec_ctrl |
>>>>>>>>         vmcs12->pin_based_vm_exec_control));
>>>>>>>>
>>>>>>>> -   if (vmcs12->pin_based_vm_exec_control &
>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER)
>>>>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>> -              vmcs12->vmx_preemption_timer_value);
>>>>>>>> +   if (vmcs12->pin_based_vm_exec_control &
>>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>>>>>> +      if (vmcs12->vm_exit_controls &
>>>>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>>>>>> +         vmcs12->vmx_preemption_timer_value =
>>>>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>>>>>> +      else
>>>>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>> +               vmcs12->vmx_preemption_timer_value);
>>>>>>>> +   }
>>>>>>>
>>>>>>> This is not correct. We still need to set the vmcs to
>>>>>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>>>>>>> vmx_preemption_timer_value has to be updated according to the saved
>>>>>>> hardware state. The corresponding code is missing in your patch so
>>>> far.
>>>>>>
>>>>>> I think something else maybe be missing here: assuming L0 handles
>> exits
>>>>>> for L2 without involving L1 (e.g. external interrupts or ept
>>>> violations),
>>>>>> then, we may spend some cycles in L0 handling these exits. Note L1 is
>>>> not
>>>>>> aware of these exits and from L1 perspective L2 was running on the
>> CPU.
>>>>>> That means that we may need to reduce these cycles spent at
>>>>>> L0 from the preemtion timer or emulate a preemption timer exit to
>>>>>> force a transition to L1 instead of resuming L2.
>>>>>
>>>>> That's precisely what the logic I described should achieve: reload the
>>>>> value we saved on L2 exit on reentry.
>>>>
>>>> But don't you think we should also reduce the cycles spent at L0 from
>> the
>>>> preemption timer ? I mean, if we spent X cycles at L0 handling a L2
>> exit
>>>> which was not forwarded to L1, then, before we resume L2,
>>>> the preemption timer should be: (previous_value_on_exit - X).
>>>> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
>>>> preemption timer exit between L2 and L1.
>>>
>>> We ask the hardware to save the value of the preemption on L2 exit. This
>>> value will be exposed to L1 (if it asked for saving as well) and/or be
>>> written back to the hardware on L2 reenty (unless L1 had a chance to run
>>> and modified it). So the time spent in L0 is implicitly subtracted.
>>
>> I think you are suggesting the following, please correct me if I am wrong.
>> 1) L1 resumes L2 with preemption timer enabled
>> 2) L0 emulates the resume/launch
>> 3) L2 runs for Y cycles until an external interrupt occurs (Y < preemption
>> timer specified by L1)
>> 4) L0 saved the preemption timer (original value - Y)
>> 5) L0 spends X cycles handling the external interrupt
>> 6) L0 resumes L2 with preemption timer = original value - Y
>>
>> Note that in this case "X is ignored".
>>
>> I was suggesting to do the following:
>> 6) If original value - Y - X > 0 then
>>  L0 resumes L2 with preemption timer = original value - Y - X
>> else
>>  L0 emulates a L2->L1 preemption timer exit (resumes L1)
> Yes, your description is right. But I'm also thinking about my
> previous consideration, why should we consider such X cycles as what
> L2 spent. For nested VMX. external interrupt is not provided by L1, it
> is triggered from L0 and want to cause periodically exit to L1, L2 is
> "accidentally injure" actually. Since these interrupts are not
> generated from L1 and not attend to affect L2, these cycles should not
> be treated as what L2 spent. Though these cycles are "spent" in view
> of L1, but they should not be taken into consideration in nested VMX.
> 
> For another example, if vcpu scheduled out when L0 handing such
> interrupts and CPU does some other things then schedule this vcpu
> again, these cycles of executing other processes should not be treated
> as what L2 spent definitely.

Think of your preemption timer test case: There you are indirectly
comparing the timer value against the TSC by checking the a preemption
timer exit happened after no more than n TSC cycles. But as the TSC L1
and L2 sees continued to tick while in L0, this test could now fail when
we leave out the L0 cycles.

An alternative would be to hide all L0 TSC cycles from the guest. But
that's not the way KVM works, independent of the preemption timer case.

BTW, you should use guest_read_tsc() on exit/entry of L2 in order to
calculate the time spent in L0. This will ensure that potential tweaks
of TSC_OFFSET that L0 might have applied in the meantime will be taken
into account.

Jan
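
A sketch of that bookkeeping, assuming the guest_read_tsc() of this vmx.c (it takes no arguments and returns the host TSC plus the current TSC_OFFSET); the nested.l2_exit_guest_tsc field and the delta variable are made up for illustration:

	/* on L2 exit, before any work is done in L0 */
	vmx->nested.l2_exit_guest_tsc = guest_read_tsc();

	/* ... L0 handles the exit without going to L1 ... */

	/* right before reentering L2 */
	l0_tsc_delta = guest_read_tsc() - vmx->nested.l2_exit_guest_tsc;

l0_tsc_delta would then be scaled to preemption-timer ticks before being subtracted from the saved value, as discussed above.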
Arthur Chunqi Li Aug. 25, 2013, 9:07 a.m. UTC | #21
On Sun, Aug 25, 2013 at 4:53 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2013-08-25 10:41, Arthur Chunqi Li wrote:
>> On Sun, Aug 25, 2013 at 4:18 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
>>>
>>>
>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:
>>>
>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
>>>> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
>>>> Date: 25/08/2013 10:54 AM
>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>> timer
>>>> Sent by: kvm-owner@vger.kernel.org
>>>>
>>>> On 2013-08-25 09:50, Abel Gordon wrote:
>>>>>
>>>>>
>>>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>>>>>
>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>>>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>>>>>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>>>>>> Date: 25/08/2013 10:43 AM
>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>>>> timer
>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>
>>>>>> On 2013-08-25 09:37, Abel Gordon wrote:
>>>>>>>
>>>>>>>
>>>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>>>>>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>>>>>>>> Date: 25/08/2013 09:44 AM
>>>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
>>> preemption
>>>>>>> timer
>>>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>>>
>>>>>>>> On 2013-08-24 20:44, root wrote:
>>>>>>>>> This patch contains the following two changes:
>>>>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>>>>>> be save in such exits.
>>>>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
>>> controls
>>>>>>>>> to nVMX.
>>>>>>>>>
>>>>>>>>> With this patch, nested VMX preemption timer features are fully
>>>>>>>>> supported.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>>>>>> ---
>>>>>>>
>>>>>>>>>
>>>>>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>>>>>>>> *vcpu, struct vmcs12 *vmcs12)
>>>>>>>>>        (vmcs_config.pin_based_exec_ctrl |
>>>>>>>>>         vmcs12->pin_based_vm_exec_control));
>>>>>>>>>
>>>>>>>>> -   if (vmcs12->pin_based_vm_exec_control &
>>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER)
>>>>>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>>> -              vmcs12->vmx_preemption_timer_value);
>>>>>>>>> +   if (vmcs12->pin_based_vm_exec_control &
>>>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>>>>>>> +      if (vmcs12->vm_exit_controls &
>>>>>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>>>>>>> +         vmcs12->vmx_preemption_timer_value =
>>>>>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>>>>>>> +      else
>>>>>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>>> +               vmcs12->vmx_preemption_timer_value);
>>>>>>>>> +   }
>>>>>>>>
>>>>>>>> This is not correct. We still need to set the vmcs to
>>>>>>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>>>>>>>> vmx_preemption_timer_value has to be updated according to the saved
>>>>>>>> hardware state. The corresponding code is missing in your patch so
>>>>> far.
>>>>>>>
>>>>>>> I think something else maybe be missing here: assuming L0 handles
>>> exits
>>>>>>> for L2 without involving L1 (e.g. external interrupts or ept
>>>>> violations),
>>>>>>> then, we may spend some cycles in L0 handling these exits. Note L1 is
>>>>> not
>>>>>>> aware of these exits and from L1 perspective L2 was running on the
>>> CPU.
>>>>>>> That means that we may need to reduce these cycles spent at
>>>>>>> L0 from the preemtion timer or emulate a preemption timer exit to
>>>>>>> force a transition to L1 instead of resuming L2.
>>>>>>
>>>>>> That's precisely what the logic I described should achieve: reload the
>>>>>> value we saved on L2 exit on reentry.
>>>>>
>>>>> But don't you think we should also reduce the cycles spent at L0 from
>>> the
>>>>> preemption timer ? I mean, if we spent X cycles at L0 handling a L2
>>> exit
>>>>> which was not forwarded to L1, then, before we resume L2,
>>>>> the preemption timer should be: (previous_value_on_exit - X).
>>>>> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
>>>>> preemption timer exit between L2 and L1.
>>>>
>>>> We ask the hardware to save the value of the preemption on L2 exit. This
>>>> value will be exposed to L1 (if it asked for saving as well) and/or be
>>>> written back to the hardware on L2 reenty (unless L1 had a chance to run
>>>> and modified it). So the time spent in L0 is implicitly subtracted.
>>>
>>> I think you are suggesting the following, please correct me if I am wrong.
>>> 1) L1 resumes L2 with preemption timer enabled
>>> 2) L0 emulates the resume/launch
>>> 3) L2 runs for Y cycles until an external interrupt occurs (Y < preemption
>>> timer specified by L1)
>>> 4) L0 saved the preemption timer (original value - Y)
>>> 5) L0 spends X cycles handling the external interrupt
>>> 6) L0 resumes L2 with preemption timer = original value - Y
>>>
>>> Note that in this case "X is ignored".
>>>
>>> I was suggesting to do the following:
>>> 6) If original value - Y - X > 0 then
>>>  L0 resumes L2 with preemption timer = original value - Y - X
>>> else
>>>  L0 emulates a L2->L1 preemption timer exit (resumes L1)
>> Yes, your description is right. But I'm also thinking about my
>> previous consideration, why should we consider such X cycles as what
>> L2 spent. For nested VMX. external interrupt is not provided by L1, it
>> is triggered from L0 and want to cause periodically exit to L1, L2 is
>> "accidentally injure" actually. Since these interrupts are not
>> generated from L1 and not attend to affect L2, these cycles should not
>> be treated as what L2 spent. Though these cycles are "spent" in view
>> of L1, but they should not be taken into consideration in nested VMX.
>>
>> For another example, if vcpu scheduled out when L0 handing such
>> interrupts and CPU does some other things then schedule this vcpu
>> again, these cycles of executing other processes should not be treated
>> as what L2 spent definitely.
>
> Think of your preemption timer test case: There you are indirectly
> comparing the timer value against the TSC by checking the a preemption
> timer exit happened after no more than n TSC cycles. But as the TSC L1
> and L2 sees continued to tick while in L0, this test could now fail when
> we leave out the L0 cycles.
>
> An alternative would be to hide all L0 TSC cycles from the guest. But
> that's not the way KVM works, independent of the preemption timer case.
>
> BTW, you should use guest_read_tsc() on exit/entry of L2 in order to
> calculate the time spent in L0. This will ensure that potential tweaks
> of TSC_OFFSET that L0 might have applied in the meantime will be taken
> into account.
Well, in this case these X cycles are actually spent neither in L1 nor
in L2, but L2 is treated as if it consumed them, which makes the cycles
look "stolen".

Arthur
>
> Jan
>
>
Jan Kiszka Aug. 25, 2013, 9:08 a.m. UTC | #22
On 2013-08-25 11:07, Arthur Chunqi Li wrote:
> On Sun, Aug 25, 2013 at 4:53 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>> On 2013-08-25 10:41, Arthur Chunqi Li wrote:
>>> On Sun, Aug 25, 2013 at 4:18 PM, Abel Gordon <ABELG@il.ibm.com> wrote:
>>>>
>>>>
>>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:54:13 AM:
>>>>
>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>>> Cc: gleb@redhat.com, kvm <kvm@vger.kernel.org>, pbonzini@redhat.com,
>>>>> "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>
>>>>> Date: 25/08/2013 10:54 AM
>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>>> timer
>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>
>>>>> On 2013-08-25 09:50, Abel Gordon wrote:
>>>>>>
>>>>>>
>>>>>> kvm-owner@vger.kernel.org wrote on 25/08/2013 10:43:12 AM:
>>>>>>
>>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>>> To: Abel Gordon/Haifa/IBM@IBMIL,
>>>>>>> Cc: gleb@redhat.com, kvm@vger.kernel.org, kvm-owner@vger.kernel.org,
>>>>>>> pbonzini@redhat.com, "??? <Arthur Chunqi Li>" <yzt356@gmail.com>
>>>>>>> Date: 25/08/2013 10:43 AM
>>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX preemption
>>>>>> timer
>>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>>
>>>>>>> On 2013-08-25 09:37, Abel Gordon wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> From: Jan Kiszka <jan.kiszka@web.de>
>>>>>>>>> To: "??? <Arthur Chunqi Li>"  <yzt356@gmail.com>,
>>>>>>>>> Cc: kvm@vger.kernel.org, gleb@redhat.com, pbonzini@redhat.com
>>>>>>>>> Date: 25/08/2013 09:44 AM
>>>>>>>>> Subject: Re: [PATCH] KVM: nVMX: Fully support of nested VMX
>>>> preemption
>>>>>>>> timer
>>>>>>>>> Sent by: kvm-owner@vger.kernel.org
>>>>>>>>>
>>>>>>>>> On 2013-08-24 20:44, root wrote:
>>>>>>>>>> This patch contains the following two changes:
>>>>>>>>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>>>>>>>>> with some reasons not emulated by L1, preemption timer value should
>>>>>>>>>> be save in such exits.
>>>>>>>>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit
>>>> controls
>>>>>>>>>> to nVMX.
>>>>>>>>>>
>>>>>>>>>> With this patch, nested VMX preemption timer features are fully
>>>>>>>>>> supported.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>>>>>>>>> ---
>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> @@ -7578,9 +7579,14 @@ static void prepare_vmcs02(struct kvm_vcpu
>>>>>>>>> *vcpu, struct vmcs12 *vmcs12)
>>>>>>>>>>        (vmcs_config.pin_based_exec_ctrl |
>>>>>>>>>>         vmcs12->pin_based_vm_exec_control));
>>>>>>>>>>
>>>>>>>>>> -   if (vmcs12->pin_based_vm_exec_control &
>>>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER)
>>>>>>>>>> -      vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>>>> -              vmcs12->vmx_preemption_timer_value);
>>>>>>>>>> +   if (vmcs12->pin_based_vm_exec_control &
>>>>>>>>> PIN_BASED_VMX_PREEMPTION_TIMER) {
>>>>>>>>>> +      if (vmcs12->vm_exit_controls &
>>>>>>>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>>>>>>>>>> +         vmcs12->vmx_preemption_timer_value =
>>>>>>>>>> +            vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>>>>>>>>> +      else
>>>>>>>>>> +         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>>>>>>>>>> +               vmcs12->vmx_preemption_timer_value);
>>>>>>>>>> +   }
>>>>>>>>>
>>>>>>>>> This is not correct. We still need to set the vmcs to
>>>>>>>>> vmx_preemption_timer_value. The difference is that, on exit from L2,
>>>>>>>>> vmx_preemption_timer_value has to be updated according to the saved
>>>>>>>>> hardware state. The corresponding code is missing in your patch so
>>>>>> far.
>>>>>>>>
>>>>>>>> I think something else maybe be missing here: assuming L0 handles
>>>> exits
>>>>>>>> for L2 without involving L1 (e.g. external interrupts or ept
>>>>>> violations),
>>>>>>>> then, we may spend some cycles in L0 handling these exits. Note L1 is
>>>>>> not
>>>>>>>> aware of these exits and from L1 perspective L2 was running on the
>>>> CPU.
>>>>>>>> That means that we may need to reduce these cycles spent at
>>>>>>>> L0 from the preemtion timer or emulate a preemption timer exit to
>>>>>>>> force a transition to L1 instead of resuming L2.
>>>>>>>
>>>>>>> That's precisely what the logic I described should achieve: reload the
>>>>>>> value we saved on L2 exit on reentry.
>>>>>>
>>>>>> But don't you think we should also reduce the cycles spent at L0 from
>>>> the
>>>>>> preemption timer ? I mean, if we spent X cycles at L0 handling a L2
>>>> exit
>>>>>> which was not forwarded to L1, then, before we resume L2,
>>>>>> the preemption timer should be: (previous_value_on_exit - X).
>>>>>> If (previous_value_on_exit - X) < 0, then we should force ("emulate") a
>>>>>> preemption timer exit between L2 and L1.
>>>>>
>>>>> We ask the hardware to save the value of the preemption on L2 exit. This
>>>>> value will be exposed to L1 (if it asked for saving as well) and/or be
>>>>> written back to the hardware on L2 reenty (unless L1 had a chance to run
>>>>> and modified it). So the time spent in L0 is implicitly subtracted.
>>>>
>>>> I think you are suggesting the following, please correct me if I am wrong.
>>>> 1) L1 resumes L2 with preemption timer enabled
>>>> 2) L0 emulates the resume/launch
>>>> 3) L2 runs for Y cycles until an external interrupt occurs (Y < preemption
>>>> timer specified by L1)
>>>> 4) L0 saved the preemption timer (original value - Y)
>>>> 5) L0 spends X cycles handling the external interrupt
>>>> 6) L0 resumes L2 with preemption timer = original value - Y
>>>>
>>>> Note that in this case "X is ignored".
>>>>
>>>> I was suggesting to do the following:
>>>> 6) If original value - Y - X > 0 then
>>>>  L0 resumes L2 with preemption timer = original value - Y - X
>>>> else
>>>>  L0 emulates a L2->L1 preemption timer exit (resumes L1)
>>> Yes, your description is right. But I'm also thinking about my
>>> previous consideration, why should we consider such X cycles as what
>>> L2 spent. For nested VMX. external interrupt is not provided by L1, it
>>> is triggered from L0 and want to cause periodically exit to L1, L2 is
>>> "accidentally injure" actually. Since these interrupts are not
>>> generated from L1 and not attend to affect L2, these cycles should not
>>> be treated as what L2 spent. Though these cycles are "spent" in view
>>> of L1, but they should not be taken into consideration in nested VMX.
>>>
>>> For another example, if vcpu scheduled out when L0 handing such
>>> interrupts and CPU does some other things then schedule this vcpu
>>> again, these cycles of executing other processes should not be treated
>>> as what L2 spent definitely.
>>
>> Think of your preemption timer test case: There you are indirectly
>> comparing the timer value against the TSC by checking the a preemption
>> timer exit happened after no more than n TSC cycles. But as the TSC L1
>> and L2 sees continued to tick while in L0, this test could now fail when
>> we leave out the L0 cycles.
>>
>> An alternative would be to hide all L0 TSC cycles from the guest. But
>> that's not the way KVM works, independent of the preemption timer case.
>>
>> BTW, you should use guest_read_tsc() on exit/entry of L2 in order to
>> calculate the time spent in L0. This will ensure that potential tweaks
>> of TSC_OFFSET that L0 might have applied in the meantime will be taken
>> into account.
> Well, in this case, these X cycles is actually not in L1 and L2, but
> it is treated that L2 consumes them, which seems like these cycles are
> "stolen".

Yes, they are stolen by L0 from L2.

Jan
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 57b4e12..9579409 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2204,7 +2204,8 @@  static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
 	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 				      VM_EXIT_LOAD_IA32_EFER);
 
@@ -7578,9 +7579,14 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		(vmcs_config.pin_based_exec_ctrl |
 		 vmcs12->pin_based_vm_exec_control));
 
-	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
-			     vmcs12->vmx_preemption_timer_value);
+	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) {
+		if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+			vmcs12->vmx_preemption_timer_value =
+				vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+		else
+			vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
+					vmcs12->vmx_preemption_timer_value);
+	}
 
 	/*
 	 * Whether page-faults are trapped is determined by a combination of
@@ -7690,7 +7696,11 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
 	 * bits are further modified by vmx_set_efer() below.
 	 */
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+		vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl |
+				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+	else
+		vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
@@ -7912,6 +7922,16 @@  static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	}
 
 	/*
+	 * If L2 support PIN_BASED_VMX_PREEMPTION_TIMER, L0 must support
+	 * VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
+	 */
+	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
+			!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	/*
 	 * We're finally done with prerequisite checking, and can start with
 	 * the nested entry.
 	 */