diff mbox

[v4] KVM: nVMX: Fully support of nested VMX preemption timer

Message ID 1378433091-18233-1-git-send-email-yzt356@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Arthur Chunqi Li Sept. 6, 2013, 2:04 a.m. UTC
This patch contains the following two changes:
1. Fix the bug in nested preemption timer support. If a vmexit L2->L0
occurs for some reason not emulated by L1, the preemption timer value
should be saved on such exits.
2. Add support of "Save VMX-preemption timer value" VM-Exit controls
to nVMX.

With this patch, nested VMX preemption timer features are fully
supported.

Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
---
ChangeLog to v3:
Move nested_adjust_preemption_timer to the latest place just before vmenter.
Some minor changes.

 arch/x86/include/uapi/asm/msr-index.h |    1 +
 arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 2 deletions(-)

Comments

Paolo Bonzini Sept. 13, 2013, 5:15 p.m. UTC | #1
Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
> This patch contains the following two changes:
> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> with some reasons not emulated by L1, preemption timer value should
> be save in such exits.
> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> to nVMX.
> 
> With this patch, nested VMX preemption timer features are fully
> supported.
> 
> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> ---
> ChangeLog to v3:
> Move nested_adjust_preemption_timer to the latest place just before vmenter.
> Some minor changes.
> 
>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
>  2 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
> index bb04650..b93e09a 100644
> --- a/arch/x86/include/uapi/asm/msr-index.h
> +++ b/arch/x86/include/uapi/asm/msr-index.h
> @@ -536,6 +536,7 @@
>  
>  /* MSR_IA32_VMX_MISC bits */
>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>  /* AMD-V MSRs */
>  
>  #define MSR_VM_CR                       0xc0010114
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 1f1da43..f364d16 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -374,6 +374,8 @@ struct nested_vmx {
>  	 */
>  	struct page *apic_access_page;
>  	u64 msr_ia32_feature_control;
> +	/* Set if vmexit is L2->L1 */
> +	bool nested_vmx_exit;
>  };
>  
>  #define POSTED_INTR_ON  0
> @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>  #ifdef CONFIG_X86_64
>  		VM_EXIT_HOST_ADDR_SPACE_SIZE |
>  #endif
> -		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> +		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> +		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	if (!(nested_vmx_pinbased_ctls_high &
> +				PIN_BASED_VMX_PREEMPTION_TIMER) ||
> +			!(nested_vmx_exit_ctls_high &
> +				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {

Align this under the other "!".  Also, I prefer to have one long line
for the whole "!(... & ...) ||" (and likewise below), but I don't know
if Gleb agrees

> +		nested_vmx_exit_ctls_high &=
> +			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);

Please remove parentheses around ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, and
likewise elsewhere in the patch.

> +		nested_vmx_pinbased_ctls_high &=
> +			(~PIN_BASED_VMX_PREEMPTION_TIMER);
> +	}
>  	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>  				      VM_EXIT_LOAD_IA32_EFER);
>  
> @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>  	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>  }
>  
> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
> +{
> +	u64 delta_tsc_l1;
> +	u32 preempt_val_l1, preempt_val_l2, preempt_scale;

Should this exit immediately if the preemption timer pin-based control
is disabled?

> +	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
> +			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
> +	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +	delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
> +			native_read_tsc()) - vcpu->arch.last_guest_tsc;

Please format this like:

	delta_tsc_l1 =
		kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc())
		- vcpu->arch.last_guest_tsc;

> +	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
> +	if (preempt_val_l2 <= preempt_val_l1)
> +		preempt_val_l2 = 0;
> +	else
> +		preempt_val_l2 -= preempt_val_l1;
> +	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);

Did you test that a value of 0 triggers an immediate exit, rather than
counting down by 2^32?  Perhaps it's safer to limit the value to 1
instead of 0.

> +}
> +
>  /*
>   * The guest has exited.  See if we can fix it or if we need userspace
>   * assistance.
> @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  		vmx->nested.nested_run_pending = 0;
>  
>  	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> +		vmx->nested.nested_vmx_exit = true;

I think this assignment should be in nested_vmx_vmexit, since it is
called from other places as well.

>  		nested_vmx_vmexit(vcpu);
>  		return 1;
>  	}
> +	vmx->nested.nested_vmx_exit = false;
>  
>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>  		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	debugctlmsr = get_debugctlmsr();
>  
>  	vmx->__launched = vmx->loaded_vmcs->launched;
> +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
> +		nested_adjust_preemption_timer(vcpu);

Please leave the assignment to __launched last, since it's already
initializing the asm below.

I don't like the is_guest_mode check here... Maybe it's
micro-optimizing, but I wonder if we already do too many checks in
vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
to a check for "vmx->loaded_vmcs == &vmx->vmcs1".

Alternatively, we could change nested_vmx_exit to an enum in struct
vcpu_vmx (with values for L0->L1, L0->L2, L1->L2) that is initialized in
vmx_handle_exit.  Then we could check directly for L0->L2 and not adjust
the preemption timer in other cases.  In fact, I suspect this enum could
replace HF_GUEST_MASK altogether.  However, this would require some
other, more complicated, changes to svm.c.

Gleb, what do you think?

Paolo

>  	asm(
>  		/* Store host registers */
>  		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
> @@ -7518,6 +7552,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 exec_control;
> +	u32 exit_control;
>  
>  	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>  	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
> @@ -7691,7 +7726,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>  	 * bits are further modified by vmx_set_efer() below.
>  	 */
> -	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> +	exit_control = vmcs_config.vmexit_ctrl;
> +	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
> +		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>  
>  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>  	 * emulated by vmx_set_efer(), below.
> @@ -8090,6 +8128,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	vmcs12->guest_pending_dbg_exceptions =
>  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>  
> +	if ((vmcs12->pin_based_vm_exec_control &
> +				PIN_BASED_VMX_PREEMPTION_TIMER) &&
> +			(vmcs12->vm_exit_controls &
> +				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> +		vmcs12->vmx_preemption_timer_value =
> +			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +
>  	/*
>  	 * In some cases (usually, nested EPT), L2 is allowed to change its
>  	 * own CR3 without exiting. If it has changed it, we must keep it.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka Sept. 14, 2013, 7:44 a.m. UTC | #2
On 2013-09-13 19:15, Paolo Bonzini wrote:
> Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
>> +	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>> +	if (preempt_val_l2 <= preempt_val_l1)
>> +		preempt_val_l2 = 0;
>> +	else
>> +		preempt_val_l2 -= preempt_val_l1;
>> +	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
> 
> Did you test that a value of 0 triggers an immediate exit, rather than
> counting down by 2^32?  Perhaps it's safer to limit the value to 1
> instead of 0.

In my experience, 0 triggers immediate exits when the preemption timer
is enabled.

Jan
Arthur Chunqi Li Sept. 15, 2013, 7:59 a.m. UTC | #3
On Sat, Sep 14, 2013 at 3:44 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2013-09-13 19:15, Paolo Bonzini wrote:
>> Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
>>> +    preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>>> +    if (preempt_val_l2 <= preempt_val_l1)
>>> +            preempt_val_l2 = 0;
>>> +    else
>>> +            preempt_val_l2 -= preempt_val_l1;
>>> +    vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
>>
>> Did you test that a value of 0 triggers an immediate exit, rather than
>> counting down by 2^32?  Perhaps it's safer to limit the value to 1
>> instead of 0.
>
> To my experience, 0 triggers immediate exists when the preemption timer
> is enabled.
Yes, L2 VM will exit immediately when the value is 0 with my patch.

Arthur
>
> Jan
>
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Sept. 15, 2013, 12:31 p.m. UTC | #4
On Fri, Sep 06, 2013 at 10:04:51AM +0800, Arthur Chunqi Li wrote:
> This patch contains the following two changes:
> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> with some reasons not emulated by L1, preemption timer value should
> be save in such exits.
> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> to nVMX.
> 
> With this patch, nested VMX preemption timer features are fully
> supported.
> 
> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> ---
> ChangeLog to v3:
> Move nested_adjust_preemption_timer to the latest place just before vmenter.
> Some minor changes.
> 
>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
>  2 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
> index bb04650..b93e09a 100644
> --- a/arch/x86/include/uapi/asm/msr-index.h
> +++ b/arch/x86/include/uapi/asm/msr-index.h
> @@ -536,6 +536,7 @@
>  
>  /* MSR_IA32_VMX_MISC bits */
>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>  /* AMD-V MSRs */
>  
>  #define MSR_VM_CR                       0xc0010114
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 1f1da43..f364d16 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -374,6 +374,8 @@ struct nested_vmx {
>  	 */
>  	struct page *apic_access_page;
>  	u64 msr_ia32_feature_control;
> +	/* Set if vmexit is L2->L1 */
> +	bool nested_vmx_exit;
Do not see why it is needed, see below.

>  };
>  
>  #define POSTED_INTR_ON  0
> @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>  #ifdef CONFIG_X86_64
>  		VM_EXIT_HOST_ADDR_SPACE_SIZE |
>  #endif
> -		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> +		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> +		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	if (!(nested_vmx_pinbased_ctls_high &
> +				PIN_BASED_VMX_PREEMPTION_TIMER) ||
> +			!(nested_vmx_exit_ctls_high &
> +				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
> +		nested_vmx_exit_ctls_high &=
> +			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> +		nested_vmx_pinbased_ctls_high &=
> +			(~PIN_BASED_VMX_PREEMPTION_TIMER);
> +	}
>  	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>  				      VM_EXIT_LOAD_IA32_EFER);
>  
> @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>  	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>  }
>  
> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
> +{
> +	u64 delta_tsc_l1;
> +	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
> +
> +	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
> +			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
> +	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +	delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
> +			native_read_tsc()) - vcpu->arch.last_guest_tsc;
> +	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
> +	if (preempt_val_l2 <= preempt_val_l1)
> +		preempt_val_l2 = 0;
> +	else
> +		preempt_val_l2 -= preempt_val_l1;
> +	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
> +}
> +
>  /*
>   * The guest has exited.  See if we can fix it or if we need userspace
>   * assistance.
> @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  		vmx->nested.nested_run_pending = 0;
>  
>  	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> +		vmx->nested.nested_vmx_exit = true;
>  		nested_vmx_vmexit(vcpu);
>  		return 1;
>  	}
> +	vmx->nested.nested_vmx_exit = false;
>  
>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>  		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	debugctlmsr = get_debugctlmsr();
>  
>  	vmx->__launched = vmx->loaded_vmcs->launched;
> +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
How can is_guest_mode() and nested_vmx_exit both be true? The only place
nested_vmx_exit is set to true is just before the call to
nested_vmx_vmexit(). The first thing nested_vmx_vmexit() does is make
is_guest_mode() false. To enter guest mode again, at least one other
vmexit from L1 to L0 is needed, at which point nested_vmx_exit will be
reset to false again.

If you want to avoid calling nested_adjust_preemption_timer() during 
vmlaunch/vmresume emulation (and it looks like this is what you are
trying to achieve here) you can check nested_run_pending.


> +		nested_adjust_preemption_timer(vcpu);
>  	asm(
>  		/* Store host registers */
>  		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
> @@ -7518,6 +7552,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 exec_control;
> +	u32 exit_control;
>  
>  	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>  	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
> @@ -7691,7 +7726,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>  	 * bits are further modified by vmx_set_efer() below.
>  	 */
> -	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> +	exit_control = vmcs_config.vmexit_ctrl;
> +	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
> +		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>  
>  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>  	 * emulated by vmx_set_efer(), below.
> @@ -8090,6 +8128,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	vmcs12->guest_pending_dbg_exceptions =
>  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>  
> +	if ((vmcs12->pin_based_vm_exec_control &
> +				PIN_BASED_VMX_PREEMPTION_TIMER) &&
> +			(vmcs12->vm_exit_controls &
> +				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> +		vmcs12->vmx_preemption_timer_value =
> +			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +
>  	/*
>  	 * In some cases (usually, nested EPT), L2 is allowed to change its
>  	 * own CR3 without exiting. If it has changed it, we must keep it.
> -- 
> 1.7.9.5

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arthur Chunqi Li Sept. 16, 2013, 5:42 a.m. UTC | #5
On Sat, Sep 14, 2013 at 1:15 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
>> This patch contains the following two changes:
>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> with some reasons not emulated by L1, preemption timer value should
>> be save in such exits.
>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>> to nVMX.
>>
>> With this patch, nested VMX preemption timer features are fully
>> supported.
>>
>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> ---
>> ChangeLog to v3:
>> Move nested_adjust_preemption_timer to the latest place just before vmenter.
>> Some minor changes.
>>
>>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>>  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
>>  2 files changed, 48 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
>> index bb04650..b93e09a 100644
>> --- a/arch/x86/include/uapi/asm/msr-index.h
>> +++ b/arch/x86/include/uapi/asm/msr-index.h
>> @@ -536,6 +536,7 @@
>>
>>  /* MSR_IA32_VMX_MISC bits */
>>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
>> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>>  /* AMD-V MSRs */
>>
>>  #define MSR_VM_CR                       0xc0010114
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 1f1da43..f364d16 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -374,6 +374,8 @@ struct nested_vmx {
>>        */
>>       struct page *apic_access_page;
>>       u64 msr_ia32_feature_control;
>> +     /* Set if vmexit is L2->L1 */
>> +     bool nested_vmx_exit;
>>  };
>>
>>  #define POSTED_INTR_ON  0
>> @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>  #ifdef CONFIG_X86_64
>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>  #endif
>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     if (!(nested_vmx_pinbased_ctls_high &
>> +                             PIN_BASED_VMX_PREEMPTION_TIMER) ||
>> +                     !(nested_vmx_exit_ctls_high &
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>
> Align this under the other "!".  Also, I prefer to have one long line
> for the whole "!(... & ...) ||" (and likewise below), but I don't know
> if Gleb agrees
>
>> +             nested_vmx_exit_ctls_high &=
>> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>
> Please remove parentheses around ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, and
> likewise elsewhere in the patch.
>
>> +             nested_vmx_pinbased_ctls_high &=
>> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
>> +     }
>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>                                     VM_EXIT_LOAD_IA32_EFER);
>>
>> @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>>  }
>>
>> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
>> +{
>> +     u64 delta_tsc_l1;
>> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>
> Should this exit immediately if the preemption timer pin-based control
> is disabled?
Hi Paolo,
How can I get pin-based control here from "struct kvm_vcpu *vcpu"?

Arthur
>
>> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
>> +                     MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
>> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
>> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
>
> Please format this like:
>
>         delta_tsc_l1 =
>                 kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc())
>                 - vcpu->arch.last_guest_tsc;
>
>> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>> +     if (preempt_val_l2 <= preempt_val_l1)
>> +             preempt_val_l2 = 0;
>> +     else
>> +             preempt_val_l2 -= preempt_val_l1;
>> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
>
> Did you test that a value of 0 triggers an immediate exit, rather than
> counting down by 2^32?  Perhaps it's safer to limit the value to 1
> instead of 0.
>
>> +}
>> +
>>  /*
>>   * The guest has exited.  See if we can fix it or if we need userspace
>>   * assistance.
>> @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>               vmx->nested.nested_run_pending = 0;
>>
>>       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
>> +             vmx->nested.nested_vmx_exit = true;
>
> I think this assignment should be in nested_vmx_vmexit, since it is
> called from other places as well.
>
>>               nested_vmx_vmexit(vcpu);
>>               return 1;
>>       }
>> +     vmx->nested.nested_vmx_exit = false;
>>
>>       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>>               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
>> @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>       debugctlmsr = get_debugctlmsr();
>>
>>       vmx->__launched = vmx->loaded_vmcs->launched;
>> +     if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
>> +             nested_adjust_preemption_timer(vcpu);
>
> Please leave the assignment to __launched last, since it's already
> initializing the asm below.
>
> I don't like the is_guest_mode check here... Maybe it's
> micro-optimizing, but I wonder if we already do too many checks in
> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
>
> Alternatively, we could change nested_vmx_exit to an enum in struct
> vcpu_vmx (with values for L0->L1, L0->L2, L1->L2) that is initialized in
> vmx_handle_exit.  Then we could check directly for L0->L2 and not adjust
> the preemption timer in other cases.  In fact, I suspect this enum could
> replace HF_GUEST_MASK altogether.  However, this would require some
> other, more complicated, changes to svm.c.
>
> Gleb, what do you think?
>
> Paolo
>
>>       asm(
>>               /* Store host registers */
>>               "push %%" _ASM_DX "; push %%" _ASM_BP ";"
>> @@ -7518,6 +7552,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  {
>>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>>       u32 exec_control;
>> +     u32 exit_control;
>>
>>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
>> @@ -7691,7 +7726,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>        * bits are further modified by vmx_set_efer() below.
>>        */
>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> +     exit_control = vmcs_config.vmexit_ctrl;
>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>> +             exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>>
>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>>        * emulated by vmx_set_efer(), below.
>> @@ -8090,6 +8128,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>       vmcs12->guest_pending_dbg_exceptions =
>>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>>
>> +     if ((vmcs12->pin_based_vm_exec_control &
>> +                             PIN_BASED_VMX_PREEMPTION_TIMER) &&
>> +                     (vmcs12->vm_exit_controls &
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> +             vmcs12->vmx_preemption_timer_value =
>> +                     vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +
>>       /*
>>        * In some cases (usually, nested EPT), L2 is allowed to change its
>>        * own CR3 without exiting. If it has changed it, we must keep it.
>>
>
Arthur Chunqi Li Sept. 16, 2013, 5:49 a.m. UTC | #6
On Sun, Sep 15, 2013 at 8:31 PM, Gleb Natapov <gleb@redhat.com> wrote:
> On Fri, Sep 06, 2013 at 10:04:51AM +0800, Arthur Chunqi Li wrote:
>> This patch contains the following two changes:
>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> with some reasons not emulated by L1, preemption timer value should
>> be save in such exits.
>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>> to nVMX.
>>
>> With this patch, nested VMX preemption timer features are fully
>> supported.
>>
>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> ---
>> ChangeLog to v3:
>> Move nested_adjust_preemption_timer to the latest place just before vmenter.
>> Some minor changes.
>>
>>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>>  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
>>  2 files changed, 48 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
>> index bb04650..b93e09a 100644
>> --- a/arch/x86/include/uapi/asm/msr-index.h
>> +++ b/arch/x86/include/uapi/asm/msr-index.h
>> @@ -536,6 +536,7 @@
>>
>>  /* MSR_IA32_VMX_MISC bits */
>>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
>> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>>  /* AMD-V MSRs */
>>
>>  #define MSR_VM_CR                       0xc0010114
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 1f1da43..f364d16 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -374,6 +374,8 @@ struct nested_vmx {
>>        */
>>       struct page *apic_access_page;
>>       u64 msr_ia32_feature_control;
>> +     /* Set if vmexit is L2->L1 */
>> +     bool nested_vmx_exit;
> Do not see why it is needed, see bellow.
>
>>  };
>>
>>  #define POSTED_INTR_ON  0
>> @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>  #ifdef CONFIG_X86_64
>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>  #endif
>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     if (!(nested_vmx_pinbased_ctls_high &
>> +                             PIN_BASED_VMX_PREEMPTION_TIMER) ||
>> +                     !(nested_vmx_exit_ctls_high &
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>> +             nested_vmx_exit_ctls_high &=
>> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>> +             nested_vmx_pinbased_ctls_high &=
>> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
>> +     }
>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>                                     VM_EXIT_LOAD_IA32_EFER);
>>
>> @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>>  }
>>
>> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
>> +{
>> +     u64 delta_tsc_l1;
>> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>> +
>> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
>> +                     MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
>> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
>> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
>> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>> +     if (preempt_val_l2 <= preempt_val_l1)
>> +             preempt_val_l2 = 0;
>> +     else
>> +             preempt_val_l2 -= preempt_val_l1;
>> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
>> +}
>> +
>>  /*
>>   * The guest has exited.  See if we can fix it or if we need userspace
>>   * assistance.
>> @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>               vmx->nested.nested_run_pending = 0;
>>
>>       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
>> +             vmx->nested.nested_vmx_exit = true;
>>               nested_vmx_vmexit(vcpu);
>>               return 1;
>>       }
>> +     vmx->nested.nested_vmx_exit = false;
>>
>>       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>>               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
>> @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>       debugctlmsr = get_debugctlmsr();
>>
>>       vmx->__launched = vmx->loaded_vmcs->launched;
>> +     if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
> How is_guest_mode() and nested_vmx_exi can be both true? The only place
> nested_vmx_exit is set to true is just before call to
> nested_vmx_vmexit(). The firs thing nested_vmx_vmexit() does is makes
> is_guest_mode() false. To enter guest mode again at least one other
> vmexit from L1 to L0 is needed at which point nested_vmx_exit will be
> reset to false again.
>
> If you want to avoid calling nested_adjust_preemption_timer() during
> vmlaunch/vmresume emulation (and it looks like this is what you are
> trying to achieve here) you can check nested_run_pending.
Besides vmlaunch/vmresume emulation, every exit from L2->L1 should not
call nested_adjust_preemption_timer(); this function is only used to
adjust the preemption timer when L2->L0->L2. Can nested_run_pending
distinguish this?

Arthur
>
>
>> +             nested_adjust_preemption_timer(vcpu);
>>       asm(
>>               /* Store host registers */
>>               "push %%" _ASM_DX "; push %%" _ASM_BP ";"
>> @@ -7518,6 +7552,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  {
>>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>>       u32 exec_control;
>> +     u32 exit_control;
>>
>>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
>> @@ -7691,7 +7726,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>        * bits are further modified by vmx_set_efer() below.
>>        */
>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> +     exit_control = vmcs_config.vmexit_ctrl;
>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>> +             exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>>
>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>>        * emulated by vmx_set_efer(), below.
>> @@ -8090,6 +8128,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>       vmcs12->guest_pending_dbg_exceptions =
>>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>>
>> +     if ((vmcs12->pin_based_vm_exec_control &
>> +                             PIN_BASED_VMX_PREEMPTION_TIMER) &&
>> +                     (vmcs12->vm_exit_controls &
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> +             vmcs12->vmx_preemption_timer_value =
>> +                     vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +
>>       /*
>>        * In some cases (usually, nested EPT), L2 is allowed to change its
>>        * own CR3 without exiting. If it has changed it, we must keep it.
>> --
>> 1.7.9.5
>
> --
>                         Gleb.
Gleb Natapov Sept. 16, 2013, 6:20 a.m. UTC | #7
On Mon, Sep 16, 2013 at 01:49:41PM +0800, Arthur Chunqi Li wrote:
> On Sun, Sep 15, 2013 at 8:31 PM, Gleb Natapov <gleb@redhat.com> wrote:
> > On Fri, Sep 06, 2013 at 10:04:51AM +0800, Arthur Chunqi Li wrote:
> >> This patch contains the following two changes:
> >> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> >> with some reasons not emulated by L1, preemption timer value should
> >> be save in such exits.
> >> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> >> to nVMX.
> >>
> >> With this patch, nested VMX preemption timer features are fully
> >> supported.
> >>
> >> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> >> ---
> >> ChangeLog to v3:
> >> Move nested_adjust_preemption_timer to the latest place just before vmenter.
> >> Some minor changes.
> >>
> >>  arch/x86/include/uapi/asm/msr-index.h |    1 +
> >>  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
> >>  2 files changed, 48 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
> >> index bb04650..b93e09a 100644
> >> --- a/arch/x86/include/uapi/asm/msr-index.h
> >> +++ b/arch/x86/include/uapi/asm/msr-index.h
> >> @@ -536,6 +536,7 @@
> >>
> >>  /* MSR_IA32_VMX_MISC bits */
> >>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
> >> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
> >>  /* AMD-V MSRs */
> >>
> >>  #define MSR_VM_CR                       0xc0010114
> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> index 1f1da43..f364d16 100644
> >> --- a/arch/x86/kvm/vmx.c
> >> +++ b/arch/x86/kvm/vmx.c
> >> @@ -374,6 +374,8 @@ struct nested_vmx {
> >>        */
> >>       struct page *apic_access_page;
> >>       u64 msr_ia32_feature_control;
> >> +     /* Set if vmexit is L2->L1 */
> >> +     bool nested_vmx_exit;
> > Do not see why it is needed, see bellow.
> >
> >>  };
> >>
> >>  #define POSTED_INTR_ON  0
> >> @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
> >>  #ifdef CONFIG_X86_64
> >>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
> >>  #endif
> >> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> >> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> >> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> >> +     if (!(nested_vmx_pinbased_ctls_high &
> >> +                             PIN_BASED_VMX_PREEMPTION_TIMER) ||
> >> +                     !(nested_vmx_exit_ctls_high &
> >> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
> >> +             nested_vmx_exit_ctls_high &=
> >> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> >> +             nested_vmx_pinbased_ctls_high &=
> >> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
> >> +     }
> >>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
> >>                                     VM_EXIT_LOAD_IA32_EFER);
> >>
> >> @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
> >>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
> >>  }
> >>
> >> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
> >> +{
> >> +     u64 delta_tsc_l1;
> >> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
> >> +
> >> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
> >> +                     MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
> >> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
> >> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
> >> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
> >> +     if (preempt_val_l2 <= preempt_val_l1)
> >> +             preempt_val_l2 = 0;
> >> +     else
> >> +             preempt_val_l2 -= preempt_val_l1;
> >> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
> >> +}
> >> +
> >>  /*
> >>   * The guest has exited.  See if we can fix it or if we need userspace
> >>   * assistance.
> >> @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >>               vmx->nested.nested_run_pending = 0;
> >>
> >>       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> >> +             vmx->nested.nested_vmx_exit = true;
> >>               nested_vmx_vmexit(vcpu);
> >>               return 1;
> >>       }
> >> +     vmx->nested.nested_vmx_exit = false;
> >>
> >>       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
> >>               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> >> @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >>       debugctlmsr = get_debugctlmsr();
> >>
> >>       vmx->__launched = vmx->loaded_vmcs->launched;
> >> +     if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
> > How is_guest_mode() and nested_vmx_exi can be both true? The only place
> > nested_vmx_exit is set to true is just before call to
> > nested_vmx_vmexit(). The firs thing nested_vmx_vmexit() does is makes
> > is_guest_mode() false. To enter guest mode again at least one other
> > vmexit from L1 to L0 is needed at which point nested_vmx_exit will be
> > reset to false again.
> >
> > If you want to avoid calling nested_adjust_preemption_timer() during
> > vmlaunch/vmresume emulation (and it looks like this is what you are
> > trying to achieve here) you can check nested_run_pending.
> Besides vmlaunch/vmresume emulation, every exit from L2->L1 should not
> call nested_adjust_preemption_timer(), this function is just used to
> adjust preemption timer when L2->L0->L2. Can nested_run_pending
> distinguish this?
> 
On L2->L1 exit is_guest_mode() will be false during vmx_vcpu_run().
On L1->L2 entry nested_run_pending will be true. On L0->L2 entry
is_guest_mode() == true && nested_run_pending == false.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka Sept. 16, 2013, 7:18 a.m. UTC | #8
On 2013-09-16 07:42, Arthur Chunqi Li wrote:
> On Sat, Sep 14, 2013 at 1:15 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
>>> This patch contains the following two changes:
>>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>>> with some reasons not emulated by L1, preemption timer value should
>>> be save in such exits.
>>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>>> to nVMX.
>>>
>>> With this patch, nested VMX preemption timer features are fully
>>> supported.
>>>
>>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>>> ---
>>> ChangeLog to v3:
>>> Move nested_adjust_preemption_timer to the latest place just before vmenter.
>>> Some minor changes.
>>>
>>>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>>>  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
>>>  2 files changed, 48 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
>>> index bb04650..b93e09a 100644
>>> --- a/arch/x86/include/uapi/asm/msr-index.h
>>> +++ b/arch/x86/include/uapi/asm/msr-index.h
>>> @@ -536,6 +536,7 @@
>>>
>>>  /* MSR_IA32_VMX_MISC bits */
>>>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
>>> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>>>  /* AMD-V MSRs */
>>>
>>>  #define MSR_VM_CR                       0xc0010114
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index 1f1da43..f364d16 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -374,6 +374,8 @@ struct nested_vmx {
>>>        */
>>>       struct page *apic_access_page;
>>>       u64 msr_ia32_feature_control;
>>> +     /* Set if vmexit is L2->L1 */
>>> +     bool nested_vmx_exit;
>>>  };
>>>
>>>  #define POSTED_INTR_ON  0
>>> @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>>  #ifdef CONFIG_X86_64
>>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>>  #endif
>>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>>> +     if (!(nested_vmx_pinbased_ctls_high &
>>> +                             PIN_BASED_VMX_PREEMPTION_TIMER) ||
>>> +                     !(nested_vmx_exit_ctls_high &
>>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>>
>> Align this under the other "!".  Also, I prefer to have one long line
>> for the whole "!(... & ...) ||" (and likewise below), but I don't know
>> if Gleb agrees
>>
>>> +             nested_vmx_exit_ctls_high &=
>>> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>>
>> Please remove parentheses around ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, and
>> likewise elsewhere in the patch.
>>
>>> +             nested_vmx_pinbased_ctls_high &=
>>> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
>>> +     }
>>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>>                                     VM_EXIT_LOAD_IA32_EFER);
>>>
>>> @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>>>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>>>  }
>>>
>>> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
>>> +{
>>> +     u64 delta_tsc_l1;
>>> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>>
>> Should this exit immediately if the preemption timer pin-based control
>> is disabled?
> Hi Paolo,
> How can I get pin-based control here from "struct kvm_vcpu *vcpu"?

You can find get_vmcs12(vcpu)->pin_based_vm_exec_control in existing code.

Jan
Gleb Natapov Sept. 16, 2013, 7:44 a.m. UTC | #9
On Fri, Sep 13, 2013 at 07:15:11PM +0200, Paolo Bonzini wrote:
> Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
> > This patch contains the following two changes:
> > 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> > with some reasons not emulated by L1, preemption timer value should
> > be save in such exits.
> > 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> > to nVMX.
> > 
> > With this patch, nested VMX preemption timer features are fully
> > supported.
> > 
> > Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> > ---
> > ChangeLog to v3:
> > Move nested_adjust_preemption_timer to the latest place just before vmenter.
> > Some minor changes.
> > 
> >  arch/x86/include/uapi/asm/msr-index.h |    1 +
> >  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
> >  2 files changed, 48 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
> > index bb04650..b93e09a 100644
> > --- a/arch/x86/include/uapi/asm/msr-index.h
> > +++ b/arch/x86/include/uapi/asm/msr-index.h
> > @@ -536,6 +536,7 @@
> >  
> >  /* MSR_IA32_VMX_MISC bits */
> >  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
> > +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
> >  /* AMD-V MSRs */
> >  
> >  #define MSR_VM_CR                       0xc0010114
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 1f1da43..f364d16 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -374,6 +374,8 @@ struct nested_vmx {
> >  	 */
> >  	struct page *apic_access_page;
> >  	u64 msr_ia32_feature_control;
> > +	/* Set if vmexit is L2->L1 */
> > +	bool nested_vmx_exit;
> >  };
> >  
> >  #define POSTED_INTR_ON  0
> > @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
> >  #ifdef CONFIG_X86_64
> >  		VM_EXIT_HOST_ADDR_SPACE_SIZE |
> >  #endif
> > -		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> > +		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> > +		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> > +	if (!(nested_vmx_pinbased_ctls_high &
> > +				PIN_BASED_VMX_PREEMPTION_TIMER) ||
> > +			!(nested_vmx_exit_ctls_high &
> > +				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
> 
> Align this under the other "!".  Also, I prefer to have one long line
> for the whole "!(... & ...) ||" (and likewise below), but I don't know
> if Gleb agrees
> 
!(... & ...) ||
!(... & ...)

fit perfectly into 80 chars.

> > +		nested_vmx_exit_ctls_high &=
> > +			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> 
> Please remove parentheses around ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, and
> likewise elsewhere in the patch.
> 
and line break too.

> > +		nested_vmx_pinbased_ctls_high &=
> > +			(~PIN_BASED_VMX_PREEMPTION_TIMER);
> > +	}
> >  	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
> >  				      VM_EXIT_LOAD_IA32_EFER);
> >  
> > @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
> >  	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
> >  }
> >  
> > +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
> > +{
> > +	u64 delta_tsc_l1;
> > +	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
> 
> Should this exit immediately if the preemption timer pin-based control
> is disabled?
> 
> > +	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
> > +			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
> > +	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> > +	delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
> > +			native_read_tsc()) - vcpu->arch.last_guest_tsc;
> 
> Please format this like:
> 
> 	delta_tsc_l1 =
> 		kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc())
> 		- vcpu->arch.last_guest_tsc;
> 
And call vmx_read_l1_tsc() directly. Actually you can even use
to_vmx(vcpu)->nested.vmcs01_tsc_offset directly here since the function
will be called only when is_guest_mode() == true, but vmx_read_l1_tsc()
may be clearer here, and the compiler should optimize away the second
is_guest_mode() check anyway.

> > +	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
> > +	if (preempt_val_l2 <= preempt_val_l1)
> > +		preempt_val_l2 = 0;
> > +	else
> > +		preempt_val_l2 -= preempt_val_l1;
> > +	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
> 
> Did you test that a value of 0 triggers an immediate exit, rather than
> counting down by 2^32?  Perhaps it's safer to limit the value to 1
> instead of 0.
> 
> > +}
> > +
> >  /*
> >   * The guest has exited.  See if we can fix it or if we need userspace
> >   * assistance.
> > @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >  		vmx->nested.nested_run_pending = 0;
> >  
> >  	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> > +		vmx->nested.nested_vmx_exit = true;
> 
> I think this assignment should be in nested_vmx_vmexit, since it is
> called from other places as well.
> 
> >  		nested_vmx_vmexit(vcpu);
> >  		return 1;
> >  	}
> > +	vmx->nested.nested_vmx_exit = false;
> >  
> >  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
> >  		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> > @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >  	debugctlmsr = get_debugctlmsr();
> >  
> >  	vmx->__launched = vmx->loaded_vmcs->launched;
> > +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
> > +		nested_adjust_preemption_timer(vcpu);
> 
> Please leave the assignment to __launched last, since it's already
> initializing the asm below.
> 
> I don't like the is_guest_mode check here... Maybe it's
> micro-optimizing, but I wonder if we already do too many checks in
> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
> 
Why would this be more efficient than the HF_GUEST_MASK check?

> Alternatively, we could change nested_vmx_exit to an enum in struct
> vcpu_vmx (with values for L0->L1, L0->L2, L1->L2) that is initialized in
> vmx_handle_exit.  Then we could check directly for L0->L2 and not adjust
> the preemption timer in other cases.  In fact, I suspect this enum could
> replace HF_GUEST_MASK altogether.  However, this would require some
> other, more complicated, changes to svm.c.
> 
> Gleb, what do you think?
> 
I do not see why nested_vmx_exit is necessary at all yet. We can detect
all the aforementioned cases without it.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 16, 2013, 8:58 a.m. UTC | #10
Il 16/09/2013 09:44, Gleb Natapov ha scritto:
> On Fri, Sep 13, 2013 at 07:15:11PM +0200, Paolo Bonzini wrote:
>> Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
>>> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
>>> +{
>>> +	u64 delta_tsc_l1;
>>> +	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>>
>> Should this exit immediately if the preemption timer pin-based control
>> is disabled?
>>
>>> +	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
>>> +			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
>>> +	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>>> +	delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
>>> +			native_read_tsc()) - vcpu->arch.last_guest_tsc;
>>
>> Please format this like:
>>
>> 	delta_tsc_l1 =
>> 		kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc())
>> 		- vcpu->arch.last_guest_tsc;
>>
> And call vmx_read_l1_tsc() directly. Actually you can even use
> to_vmx(vcpu)->nested.vmcs01_tsc_offset directly here since the function
> will be called only when is_guest_mode() == true, but vmx_read_l1_tsc()
> may be more clear here and compile should optimize second is_guest_mode()
> check anyway.

Right.  Though I'm not that concerned about optimizing L1->L2 and L0->L2
entries; I'm more concerned about not slowing down L0->L1 (which
includes the non-nested case, of course).

>>> +	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>>> +	if (preempt_val_l2 <= preempt_val_l1)
>>> +		preempt_val_l2 = 0;
>>> +	else
>>> +		preempt_val_l2 -= preempt_val_l1;
>>> +	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
>>
>> Did you test that a value of 0 triggers an immediate exit, rather than
>> counting down by 2^32?  Perhaps it's safer to limit the value to 1
>> instead of 0.
>>
>>> +}
>>> +
>>>  /*
>>>   * The guest has exited.  See if we can fix it or if we need userspace
>>>   * assistance.
>>> @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>>  		vmx->nested.nested_run_pending = 0;
>>>  
>>>  	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
>>> +		vmx->nested.nested_vmx_exit = true;
>>
>> I think this assignment should be in nested_vmx_vmexit, since it is
>> called from other places as well.
>>
>>>  		nested_vmx_vmexit(vcpu);
>>>  		return 1;
>>>  	}
>>> +	vmx->nested.nested_vmx_exit = false;
>>>  
>>>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>>>  		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
>>> @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>>  	debugctlmsr = get_debugctlmsr();
>>>  
>>>  	vmx->__launched = vmx->loaded_vmcs->launched;
>>> +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
>>> +		nested_adjust_preemption_timer(vcpu);
>>
>> Please leave the assignment to __launched last, since it's already
>> initializing the asm below.
>>
>> I don't like the is_guest_mode check here... Maybe it's
>> micro-optimizing, but I wonder if we already do too many checks in
>> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
>> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
>>
> Why this will be more efficient that HF_GUEST_MASK check?

Because we have already loaded vmx->loaded_vmcs, it's one memory
access fewer.

>> Alternatively, we could change nested_vmx_exit to an enum in struct
>> vcpu_vmx (with values for L0->L1, L0->L2, L1->L2) that is initialized in
>> vmx_handle_exit.  Then we could check directly for L0->L2 and not adjust
>> the preemption timer in other cases.  In fact, I suspect this enum could
>> replace HF_GUEST_MASK altogether.  However, this would require some
>> other, more complicated, changes to svm.c.
>>
>> Gleb, what do you think?
>>
> I do not see why nested_vmx_exit is necessary at all yet. We can detect
> all aforementioned cases without.

How do you distinguish L0->L2 from L1->L2 in vmx_vcpu_run?

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Sept. 16, 2013, 9:09 a.m. UTC | #11
On Mon, Sep 16, 2013 at 10:58:12AM +0200, Paolo Bonzini wrote:
> >>>  	vmx->__launched = vmx->loaded_vmcs->launched;
> >>> +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
> >>> +		nested_adjust_preemption_timer(vcpu);
> >>
> >> Please leave the assignment to __launched last, since it's already
> >> initializing the asm below.
> >>
> >> I don't like the is_guest_mode check here... Maybe it's
> >> micro-optimizing, but I wonder if we already do too many checks in
> >> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
> >> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
> >>
> > Why this will be more efficient that HF_GUEST_MASK check?
> 
> Because we have already loaded vmx->loaded_vmcs, so it's one memory
> access less.
> 
But we will have to load vmx->vmcs1 instead :) Looks like a very minor
optimization, if one at all. If we could somehow avoid the additional
if() entirely, maybe by mimicking vcpu->requests in vmx to get rid of
most of the ifs there.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 16, 2013, 9:49 a.m. UTC | #12
Il 16/09/2013 11:09, Gleb Natapov ha scritto:
> On Mon, Sep 16, 2013 at 10:58:12AM +0200, Paolo Bonzini wrote:
>>>>>  	vmx->__launched = vmx->loaded_vmcs->launched;
>>>>> +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
>>>>> +		nested_adjust_preemption_timer(vcpu);
>>>>
>>>> Please leave the assignment to __launched last, since it's already
>>>> initializing the asm below.
>>>>
>>>> I don't like the is_guest_mode check here... Maybe it's
>>>> micro-optimizing, but I wonder if we already do too many checks in
>>>> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
>>>> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
>>>>
>>> Why this will be more efficient that HF_GUEST_MASK check?
>>
>> Because we have already loaded vmx->loaded_vmcs, so it's one memory
>> access less.
>>
> But we will have to load vmx->vmcs1 instead :)

That's not a memory load, it's an add.

> Looks like very minor
> optimization if at all. If we could avoid additional if() at all
> somehow, may be mimicking vcpu->requests in vmx to get rid of most ifs
> there.

Yes.

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Sept. 16, 2013, 10:44 a.m. UTC | #13
On Mon, Sep 16, 2013 at 11:49:40AM +0200, Paolo Bonzini wrote:
> Il 16/09/2013 11:09, Gleb Natapov ha scritto:
> > On Mon, Sep 16, 2013 at 10:58:12AM +0200, Paolo Bonzini wrote:
> >>>>>  	vmx->__launched = vmx->loaded_vmcs->launched;
> >>>>> +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
> >>>>> +		nested_adjust_preemption_timer(vcpu);
> >>>>
> >>>> Please leave the assignment to __launched last, since it's already
> >>>> initializing the asm below.
> >>>>
> >>>> I don't like the is_guest_mode check here... Maybe it's
> >>>> micro-optimizing, but I wonder if we already do too many checks in
> >>>> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
> >>>> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
> >>>>
> >>> Why this will be more efficient that HF_GUEST_MASK check?
> >>
> >> Because we have already loaded vmx->loaded_vmcs, so it's one memory
> >> access less.
> >>
> > But we will have to load vmx->vmcs1 instead :)
> 
> That's not a memory load, it's an add.
> 
You assume that vmx->loaded_vmcs and vmx will both be in registers here;
isn't that too much to assume?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 16, 2013, 10:50 a.m. UTC | #14
Il 16/09/2013 12:44, Gleb Natapov ha scritto:
> On Mon, Sep 16, 2013 at 11:49:40AM +0200, Paolo Bonzini wrote:
>> Il 16/09/2013 11:09, Gleb Natapov ha scritto:
>>> On Mon, Sep 16, 2013 at 10:58:12AM +0200, Paolo Bonzini wrote:
>>>>>>>  	vmx->__launched = vmx->loaded_vmcs->launched;
>>>>>>> +	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
>>>>>>> +		nested_adjust_preemption_timer(vcpu);
>>>>>>
>>>>>> Please leave the assignment to __launched last, since it's already
>>>>>> initializing the asm below.
>>>>>>
>>>>>> I don't like the is_guest_mode check here... Maybe it's
>>>>>> micro-optimizing, but I wonder if we already do too many checks in
>>>>>> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
>>>>>> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
>>>>>>
>>>>> Why this will be more efficient that HF_GUEST_MASK check?
>>>>
>>>> Because we have already loaded vmx->loaded_vmcs, so it's one memory
>>>> access less.
>>>>
>>> But we will have to load vmx->vmcs1 instead :)
>>
>> That's not a memory load, it's an add.
>>
> You assume that vmx->loaded_vmcs and vmx will be both in registers here,
> isn't it too much to assume?

Both of them are needed in this very close line:

    vmx->__launched = vmx->loaded_vmcs->launched;

So it is not a big assumption---even on 32-bits, perhaps.  By contrast,
vcpu->hflags is very unlikely to be in a register.

This is of course mostly pointless alone.  But if we could shave 100
cycles off a lightweight vmexit, that could give a measurable
improvement on nested virt.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index bb04650..b93e09a 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -536,6 +536,7 @@ 
 
 /* MSR_IA32_VMX_MISC bits */
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
+#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
 
 #define MSR_VM_CR                       0xc0010114
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1f1da43..f364d16 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -374,6 +374,8 @@  struct nested_vmx {
 	 */
 	struct page *apic_access_page;
 	u64 msr_ia32_feature_control;
+	/* Set if vmexit is L2->L1 */
+	bool nested_vmx_exit;
 };
 
 #define POSTED_INTR_ON  0
@@ -2204,7 +2206,17 @@  static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+	if (!(nested_vmx_pinbased_ctls_high &
+				PIN_BASED_VMX_PREEMPTION_TIMER) ||
+			!(nested_vmx_exit_ctls_high &
+				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
+		nested_vmx_exit_ctls_high &=
+			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+		nested_vmx_pinbased_ctls_high &=
+			(~PIN_BASED_VMX_PREEMPTION_TIMER);
+	}
 	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 				      VM_EXIT_LOAD_IA32_EFER);
 
@@ -6707,6 +6719,24 @@  static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
+static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	u64 delta_tsc_l1;
+	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
+
+	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
+			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
+	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+	delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
+			native_read_tsc()) - vcpu->arch.last_guest_tsc;
+	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
+	if (preempt_val_l2 <= preempt_val_l1)
+		preempt_val_l2 = 0;
+	else
+		preempt_val_l2 -= preempt_val_l1;
+	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -6736,9 +6766,11 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		vmx->nested.nested_run_pending = 0;
 
 	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
+		vmx->nested.nested_vmx_exit = true;
 		nested_vmx_vmexit(vcpu);
 		return 1;
 	}
+	vmx->nested.nested_vmx_exit = false;
 
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -7132,6 +7164,8 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	debugctlmsr = get_debugctlmsr();
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
+	if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
+		nested_adjust_preemption_timer(vcpu);
 	asm(
 		/* Store host registers */
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -7518,6 +7552,7 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control;
+	u32 exit_control;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7691,7 +7726,10 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
 	 * bits are further modified by vmx_set_efer() below.
 	 */
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+	exit_control = vmcs_config.vmexit_ctrl;
+	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
 
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
@@ -8090,6 +8128,13 @@  static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+	if ((vmcs12->pin_based_vm_exec_control &
+				PIN_BASED_VMX_PREEMPTION_TIMER) &&
+			(vmcs12->vm_exit_controls &
+				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+		vmcs12->vmx_preemption_timer_value =
+			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
 	 * own CR3 without exiting. If it has changed it, we must keep it.