
[v2,07/10] KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it

Message ID 20210512150945.4591-8-ilstam@amazon.com (mailing list archive)
State New, archived
Series KVM: Implement nested TSC scaling

Commit Message

Stamatis, Ilias May 12, 2021, 3:09 p.m. UTC
The write_l1_tsc_offset() callback has a misleading name. It does not
set L1's TSC offset, it rather updates the current TSC offset which
might be different if a nested guest is executing. Additionally, both
the vmx and svm implementations use the same logic for calculating the
current TSC before writing it to hardware.

This patch renames the function and moves the common logic to the
caller. The vmx/svm-specific code now merely sets the given offset to
the corresponding hardware structure.

Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  2 +-
 arch/x86/include/asm/kvm_host.h    |  3 +--
 arch/x86/kvm/svm/svm.c             | 21 ++++-----------------
 arch/x86/kvm/vmx/vmx.c             | 23 +++--------------------
 arch/x86/kvm/x86.c                 | 17 ++++++++++++++++-
 5 files changed, 25 insertions(+), 41 deletions(-)

Comments

Sean Christopherson May 19, 2021, 12:05 a.m. UTC | #1
On Wed, May 12, 2021, Ilias Stamatis wrote:
> The write_l1_tsc_offset() callback has a misleading name. It does not
> set L1's TSC offset, it rather updates the current TSC offset which
> might be different if a nested guest is executing. Additionally, both
> the vmx and svm implementations use the same logic for calculating the
> current TSC before writing it to hardware.

I don't disagree, but the current name has the advantage of clarifying (well,
hinting) that the offset is L1's offset.  That hint is lost in this refactoring.
Maybe rename "u64 offset" to "u64 l1_tsc_offset"?
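
For context, that rename would presumably apply to the common helper's
parameter rather than the vendor callback's, since the value the callback
receives may already be an L2 offset.  As a sketch (the patch's helper with
only the parameter renamed):

static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_tsc_offset)
{
	trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset,
				   l1_tsc_offset);

	vcpu->arch.l1_tsc_offset = l1_tsc_offset;
	vcpu->arch.tsc_offset = l1_tsc_offset;

	/* See the discussion further down on this call and its side effects. */
	if (is_guest_mode(vcpu))
		kvm_set_02_tsc_offset(vcpu);

	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
}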

> This patch renames the function and moves the common logic to the

Use imperative mood instead of "This patch".  From
Documentation/process/submitting-patches.rst:

  Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
  instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
  to do frotz", as if you are giving orders to the codebase to change
  its behaviour.
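
For this patch, that would read, e.g., "Rename the function and move the
common logic to the caller."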

> caller. The vmx/svm-specific code now merely sets the given offset to
> the corresponding hardware structure.
> 
> Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  2 +-
>  arch/x86/include/asm/kvm_host.h    |  3 +--
>  arch/x86/kvm/svm/svm.c             | 21 ++++-----------------
>  arch/x86/kvm/vmx/vmx.c             | 23 +++--------------------
>  arch/x86/kvm/x86.c                 | 17 ++++++++++++++++-
>  5 files changed, 25 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 2063616fba1c..029c9615378f 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -89,7 +89,7 @@ KVM_X86_OP(load_mmu_pgd)
>  KVM_X86_OP_NULL(has_wbinvd_exit)
>  KVM_X86_OP(get_l2_tsc_offset)
>  KVM_X86_OP(get_l2_tsc_multiplier)
> -KVM_X86_OP(write_l1_tsc_offset)
> +KVM_X86_OP(write_tsc_offset)
>  KVM_X86_OP(get_exit_info)
>  KVM_X86_OP(check_intercept)
>  KVM_X86_OP(handle_exit_irqoff)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 57a25d8e8b0f..61cf201c001a 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1307,8 +1307,7 @@ struct kvm_x86_ops {
>  
>  	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
>  	u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
> -	/* Returns actual tsc_offset set in active VMCS */
> -	u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> +	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
>  
>  	/*
>  	 * Retrieve somewhat arbitrary exit information.  Intended to be used
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 679b2fc1a3f9..b18f60463073 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1094,26 +1094,13 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
>  	return kvm_default_tsc_scaling_ratio;
>  }
>  
> -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> -	u64 g_tsc_offset = 0;
> -
> -	if (is_guest_mode(vcpu)) {
> -		/* Write L1's TSC offset.  */
> -		g_tsc_offset = svm->vmcb->control.tsc_offset -
> -			       svm->vmcb01.ptr->control.tsc_offset;
> -		svm->vmcb01.ptr->control.tsc_offset = offset;
> -	}
> -
> -	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> -				   svm->vmcb->control.tsc_offset - g_tsc_offset,
> -				   offset);
> -
> -	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
>  
> +	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
> +	svm->vmcb->control.tsc_offset = offset;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> -	return svm->vmcb->control.tsc_offset;
>  }
>  
>  /* Evaluate instruction intercepts that depend on guest CPUID features. */
> @@ -4540,7 +4527,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  
>  	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
>  	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
> -	.write_l1_tsc_offset = svm_write_l1_tsc_offset,
> +	.write_tsc_offset = svm_write_tsc_offset,
>  
>  	.load_mmu_pgd = svm_load_mmu_pgd,
>  
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 575e13bddda8..3c4eb14a1e86 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1810,26 +1810,9 @@ static u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
>  	return multiplier;
>  }
>  
> -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
> -	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> -	u64 g_tsc_offset = 0;
> -
> -	/*
> -	 * We're here if L1 chose not to trap WRMSR to TSC. According
> -	 * to the spec, this should set L1's TSC; The offset that L1
> -	 * set for L2 remains unchanged, and still needs to be added
> -	 * to the newly set TSC to get L2's TSC.
> -	 */
> -	if (is_guest_mode(vcpu) &&
> -	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
> -		g_tsc_offset = vmcs12->tsc_offset;
> -
> -	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> -				   vcpu->arch.tsc_offset - g_tsc_offset,
> -				   offset);
> -	vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
> -	return offset + g_tsc_offset;
> +	vmcs_write64(TSC_OFFSET, offset);
>  }
>  
>  /*
> @@ -7725,7 +7708,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
>  
>  	.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
>  	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
> -	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
> +	.write_tsc_offset = vmx_write_tsc_offset,
>  
>  	.load_mmu_pgd = vmx_load_mmu_pgd,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1db6cfc2079f..f3ba1be4d5b9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2377,8 +2377,23 @@ EXPORT_SYMBOL_GPL(kvm_set_02_tsc_multiplier);
>  
>  static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
> +	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> +				   vcpu->arch.l1_tsc_offset,
> +				   offset);
> +
>  	vcpu->arch.l1_tsc_offset = offset;
> -	vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
> +	vcpu->arch.tsc_offset = offset;
> +
> +	if (is_guest_mode(vcpu)) {

Unnecessary curly braces.

> +		/*
> +		 * We're here if L1 chose not to trap WRMSR to TSC and
> +		 * according to the spec this should set L1's TSC (as opposed
> +		 * to setting L1's offset for L2).
> +		 */

While we're shuffling code, can we improve this comment?  It works for the WRMSR
case, but makes no sense in the context of host TSC adjustments.  It's not at all
clear to me that it's even correct or relevant in those cases.

> +		kvm_set_02_tsc_offset(vcpu);

I really dislike that kvm_set_02_tsc_offset() consumes a bunch of variables
_and_ sets arch.tsc_offset, e.g. it's not at all obvious that moving this call
around will break L2.  Even more bizarre is that arch.tsc_offset is conditionally
consumed.  Oh, and kvm_set_02_tsc_offset() is not idempotent since it can do a
RMW on arch.tsc_offset.
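
For reference, kvm_set_02_tsc_offset() as introduced earlier in the series has
roughly the following shape (reconstructed here, so details may differ from
the posted patch):

void kvm_set_02_tsc_offset(struct kvm_vcpu *vcpu)
{
	u64 l2_offset = static_call(kvm_x86_get_l2_tsc_offset)(vcpu);
	u64 l2_multiplier = static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu);

	/*
	 * If the multiplier is the default, the pre-existing arch.tsc_offset
	 * (which the caller must have set to L1's offset beforehand) is
	 * consumed by the "+=" below; otherwise it is overwritten with a
	 * scaled L1 offset first.  Hence the conditional consumption and the
	 * non-idempotent RMW called out above.
	 */
	if (l2_multiplier != kvm_default_tsc_scaling_ratio) {
		vcpu->arch.tsc_offset = mul_s64_u64_shr(
				(s64) vcpu->arch.l1_tsc_offset,
				l2_multiplier,
				kvm_tsc_scaling_ratio_frac_bits);
	}

	vcpu->arch.tsc_offset += l2_offset;
}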

The below static_call() dependency doesn't appear to be easily solved, but we
can at least strongly hint that vcpu->arch.tsc_offset is set.  For everything
else, I think we can clean things up by doing this (with the vendor calls
providing the L2 variables directly):

void kvm_set_l2_tsc_offset(struct kvm_vcpu *vcpu, u64 l2_offset,
			   u64 l2_multiplier)
{
	u64 l1_offset = vcpu->arch.l1_tsc_offset;

	if (l2_multiplier != kvm_default_tsc_scaling_ratio)
		l2_offset += mul_s64_u64_shr((s64)l1_offset, l2_multiplier,
					     kvm_tsc_scaling_ratio_frac_bits);
	else
		l2_offset += l1_offset;

	vcpu->arch.tsc_offset = l2_offset;
}
EXPORT_SYMBOL_GPL(kvm_set_l2_tsc_offset);

static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   vcpu->arch.l1_tsc_offset,
				   offset);

	vcpu->arch.l1_tsc_offset = offset;

	if (is_guest_mode(vcpu))
		kvm_set_l2_tsc_offset(vcpu,
				      static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
				      static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
	else
		vcpu->arch.tsc_offset = offset;

	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
}


An alternative would be to explicitly track L1 and L2, and _never_ track the
current TSC values.  I.e. always compute the correct value on the fly.  I think
the only hot path is the TSC deadline timer, and AFAICT that always runs with
L1's timer.  Speaking of which, at the end of this series, vmx_set_hv_timer()
uses L1's TSC but the current scaling ratio; that can't be right.
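
As an illustrative sketch of that alternative (the helper name is made up
here), the current offset would be derived on demand instead of cached:

static u64 kvm_current_tsc_offset(struct kvm_vcpu *vcpu)
{
	u64 offset = vcpu->arch.l1_tsc_offset;
	u64 l2_multiplier;

	if (!is_guest_mode(vcpu))
		return offset;

	/* Scale L1's offset by the multiplier L1 set for L2, then add the
	 * offset L1 set for L2. */
	l2_multiplier = static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu);
	if (l2_multiplier != kvm_default_tsc_scaling_ratio)
		offset = mul_s64_u64_shr((s64)offset, l2_multiplier,
					 kvm_tsc_scaling_ratio_frac_bits);

	return offset + static_call(kvm_x86_get_l2_tsc_offset)(vcpu);
}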

> +	}
> +
> +	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
>  }
>  
>  static inline bool kvm_check_tsc_unstable(void)
> -- 
> 2.17.1
>
Stamatis, Ilias May 19, 2021, 11:45 a.m. UTC | #2
On Wed, 2021-05-19 at 00:05 +0000, Sean Christopherson wrote:
> On Wed, May 12, 2021, Ilias Stamatis wrote:
> [...]
> 
> > +     if (is_guest_mode(vcpu)) {
> 
> Unnecessary curly braces.

Really? We are supposed to have a 6-line body without braces? I'm not
opposing, I'm just surprised that that's the coding standard.

> 
> > +             /*
> > +              * We're here if L1 chose not to trap WRMSR to TSC and
> > +              * according to the spec this should set L1's TSC (as opposed
> > +              * to setting L1's offset for L2).
> > +              */
> 
> While we're shuffling code, can we improve this comment?  It works for the WRMSR
> case, but makes no sense in the context of host TSC adjustments.  It's not at all
> clear to me that it's even correct or relevant in those cases.
> 

Do you suggest removing it completely or how do you want it to be? I don't
mind deleting it.

> > +             kvm_set_02_tsc_offset(vcpu);
> 
> I really dislike that kvm_set_02_tsc_offset() consumes a bunch of variables
> _and_ sets arch.tsc_offset, e.g. it's not at all obvious that moving this call
> around will break L2.  Even more bizarre is that arch.tsc_offset is conditionally
> consumed.  Oh, and kvm_set_02_tsc_offset() is not idempotent since it can do a
> RMW on arch.tsc_offset.
> 
> The below static_call() dependency doesn't appear to be easily solved, but we
> can at least strongly hint that vcpu->arch.tsc_offset is set.  For everything
> else, I think we can clean things up by doing this (with the vendor calls
> providing the L2 variables directly):
> 
> void kvm_set_l2_tsc_offset(struct kvm_vcpu *vcpu, u64 l2_offset,
>                            u64 l2_multiplier)
> {
>         u64 l1_offset = vcpu->arch.l1_tsc_offset;
> 
>         if (l2_multiplier != kvm_default_tsc_scaling_ratio)
>                 l2_offset += mul_s64_u64_shr((s64)l1_offset, l2_multiplier,
>                                              kvm_tsc_scaling_ratio_frac_bits);
>         else
>                 l2_offset += l1_offset;
> 
>         vcpu->arch.tsc_offset = l2_offset;
> }
> EXPORT_SYMBOL_GPL(kvm_set_l2_tsc_offset);
> 
> static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> {
>         trace_kvm_write_tsc_offset(vcpu->vcpu_id,
>                                    vcpu->arch.l1_tsc_offset,
>                                    offset);
> 
>         vcpu->arch.l1_tsc_offset = offset;
> 
>         if (is_guest_mode(vcpu))
>                 kvm_set_l2_tsc_offset(vcpu,
>                                       static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
>                                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
>         else
>                 vcpu->arch.tsc_offset = offset;
> 
>         static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
> }
> 

OK, I will change it to what you suggest above.

> 
> An alternative would be to explicitly track L1 and L2, and _never_ track the
> current TSC values.  I.e. always compute the correct value on the fly.  I think
> the only hot path is the TSC deadline timer, and AFAICT that always runs with
> L1's timer.  Speaking of which, at the end of this series, vmx_set_hv_timer()
> uses L1's TSC but the current scaling ratio; that can't be right.
> 

You are right, good catch, I will add this to patch 2.

I think we should leave the vcpu->arch.tsc_scaling_ratio variable as-is for now.

Sean Christopherson May 19, 2021, 3:49 p.m. UTC | #3
On Wed, May 19, 2021, Stamatis, Ilias wrote:
> On Wed, 2021-05-19 at 00:05 +0000, Sean Christopherson wrote:
> > On Wed, May 12, 2021, Ilias Stamatis wrote:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index 1db6cfc2079f..f3ba1be4d5b9 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -2377,8 +2377,23 @@ EXPORT_SYMBOL_GPL(kvm_set_02_tsc_multiplier);
> > > 
> > >  static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > >  {
> > > +     trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> > > +                                vcpu->arch.l1_tsc_offset,
> > > +                                offset);
> > > +
> > >       vcpu->arch.l1_tsc_offset = offset;
> > > -     vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
> > > +     vcpu->arch.tsc_offset = offset;
> > > +
> > > +     if (is_guest_mode(vcpu)) {
> > 
> > Unnecessary curly braces.
> 
> Really? We are supposed to have a 6-line body without braces? I'm not
> opposing, I'm just surprised that that's the coding standard.

Comments don't (technically) count.  I usually avoid the ambiguity by putting
the comment above the if statement.  That also helps with indentation, e.g.

	/*
	 * This is a comment.
	 */
	if (is_guest_mode(vcpu))
		kvm_set_02_tsc_offset(vcpu);

> > > +             /*
> > > +              * We're here if L1 chose not to trap WRMSR to TSC and
> > > +              * according to the spec this should set L1's TSC (as opposed
> > > +              * to setting L1's offset for L2).
> > > +              */
> > 
> > While we're shuffling code, can we improve this comment?  It works for the WRMSR
> > case, but makes no sense in the context of host TSC adjustments.  It's not at all
> > clear to me that it's even correct or relevant in those cases.
> > 
> 
> Do you suggest removing it completely or how do you want it to be? I don't
> mind deleting it.

Heh, I'd happily write the comment, except I have no idea what the logic is in
the non-WRMSR case.  I do think we need a comment; IMO none of the paths that lead
to changing the TSC offset while L2 is active are obvious.
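
One possible wording, offered purely as a sketch based on the math discussed
in this thread:

	/*
	 * If L2 is active, L1's new offset must be folded into the effective
	 * (L2) offset, whether the change came from a non-intercepted guest
	 * WRMSR to the TSC or from a host-side TSC adjustment: L2's TSC is
	 * derived from L1's TSC, so moving L1's TSC moves L2's TSC as well.
	 */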

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 2063616fba1c..029c9615378f 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -89,7 +89,7 @@  KVM_X86_OP(load_mmu_pgd)
 KVM_X86_OP_NULL(has_wbinvd_exit)
 KVM_X86_OP(get_l2_tsc_offset)
 KVM_X86_OP(get_l2_tsc_multiplier)
-KVM_X86_OP(write_l1_tsc_offset)
+KVM_X86_OP(write_tsc_offset)
 KVM_X86_OP(get_exit_info)
 KVM_X86_OP(check_intercept)
 KVM_X86_OP(handle_exit_irqoff)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 57a25d8e8b0f..61cf201c001a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1307,8 +1307,7 @@  struct kvm_x86_ops {
 
 	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
 	u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
-	/* Returns actual tsc_offset set in active VMCS */
-	u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	/*
 	 * Retrieve somewhat arbitrary exit information.  Intended to be used
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 679b2fc1a3f9..b18f60463073 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1094,26 +1094,13 @@  static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
 	return kvm_default_tsc_scaling_ratio;
 }
 
-static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	u64 g_tsc_offset = 0;
-
-	if (is_guest_mode(vcpu)) {
-		/* Write L1's TSC offset.  */
-		g_tsc_offset = svm->vmcb->control.tsc_offset -
-			       svm->vmcb01.ptr->control.tsc_offset;
-		svm->vmcb01.ptr->control.tsc_offset = offset;
-	}
-
-	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
-				   svm->vmcb->control.tsc_offset - g_tsc_offset,
-				   offset);
-
-	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
 
+	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+	svm->vmcb->control.tsc_offset = offset;
 	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-	return svm->vmcb->control.tsc_offset;
 }
 
 /* Evaluate instruction intercepts that depend on guest CPUID features. */
@@ -4540,7 +4527,7 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 
 	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
 	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
-	.write_l1_tsc_offset = svm_write_l1_tsc_offset,
+	.write_tsc_offset = svm_write_tsc_offset,
 
 	.load_mmu_pgd = svm_load_mmu_pgd,
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 575e13bddda8..3c4eb14a1e86 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1810,26 +1810,9 @@  static u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
 	return multiplier;
 }
 
-static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u64 g_tsc_offset = 0;
-
-	/*
-	 * We're here if L1 chose not to trap WRMSR to TSC. According
-	 * to the spec, this should set L1's TSC; The offset that L1
-	 * set for L2 remains unchanged, and still needs to be added
-	 * to the newly set TSC to get L2's TSC.
-	 */
-	if (is_guest_mode(vcpu) &&
-	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
-		g_tsc_offset = vmcs12->tsc_offset;
-
-	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
-				   vcpu->arch.tsc_offset - g_tsc_offset,
-				   offset);
-	vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
-	return offset + g_tsc_offset;
+	vmcs_write64(TSC_OFFSET, offset);
 }
 
 /*
@@ -7725,7 +7708,7 @@  static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
 	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
-	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
+	.write_tsc_offset = vmx_write_tsc_offset,
 
 	.load_mmu_pgd = vmx_load_mmu_pgd,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1db6cfc2079f..f3ba1be4d5b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2377,8 +2377,23 @@  EXPORT_SYMBOL_GPL(kvm_set_02_tsc_multiplier);
 
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
+	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+				   vcpu->arch.l1_tsc_offset,
+				   offset);
+
 	vcpu->arch.l1_tsc_offset = offset;
-	vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
+	vcpu->arch.tsc_offset = offset;
+
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * We're here if L1 chose not to trap WRMSR to TSC and
+		 * according to the spec this should set L1's TSC (as opposed
+		 * to setting L1's offset for L2).
+		 */
+		kvm_set_02_tsc_offset(vcpu);
+	}
+
+	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
 }
 
 static inline bool kvm_check_tsc_unstable(void)