diff mbox series

KVM: nVMX/nSVM: Fix bug which sets vcpu->arch.tsc_offset to L1 tsc_offset

Message ID 20181106101425.83721-1-liran.alon@oracle.com (mailing list archive)
State New, archived
Headers show
Series KVM: nVMX/nSVM: Fix bug which sets vcpu->arch.tsc_offset to L1 tsc_offset | expand

Commit Message

Liran Alon Nov. 6, 2018, 10:14 a.m. UTC
From: Leonid Shatz <leonid.shatz@oracle.com>

Since commit e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to
represent the running guest"), vcpu->arch.tsc_offset meaning was
changed to always reflect the tsc_offset value set on the active VMCS,
regardless of whether the vCPU is currently running L1 or L2.

However, above mentioned commit failed to also change
kvm_vcpu_write_tsc_offset() to set vcpu->arch.tsc_offset correctly.
This is because vmx_write_tsc_offset() could set the tsc_offset value
in active VMCS to given offset parameter *plus vmcs12->tsc_offset*.
However, kvm_vcpu_write_tsc_offset() just sets vcpu->arch.tsc_offset
to given offset parameter. Without taking into account the possible
addition of vmcs12->tsc_offset. (Same is true for SVM case).

Fix this issue by changing kvm_x86_ops->write_tsc_offset() to return
the tsc_offset value actually set in the active VMCS, and by modifying
kvm_vcpu_write_tsc_offset() to store the returned value in
vcpu->arch.tsc_offset.
In addition, rename write_tsc_offset() callback to write_l1_tsc_offset()
to make it clear that it is meant to set L1 TSC offset.

Fixes: e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to represent the running guest")

Reviewed-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Mihai Carabas <mihai.carabas@oracle.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Signed-off-by: Leonid Shatz <leonid.shatz@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++-
 arch/x86/kvm/svm.c              |  5 +++--
 arch/x86/kvm/vmx.c              | 18 +++++++++---------
 arch/x86/kvm/x86.c              |  6 +++---
 4 files changed, 17 insertions(+), 15 deletions(-)

Comments

Jim Mattson Nov. 8, 2018, 12:24 a.m. UTC | #1
On Tue, Nov 6, 2018 at 2:14 AM, Liran Alon <liran.alon@oracle.com> wrote:
> From: Leonid Shatz <leonid.shatz@oracle.com>
>
> Since commit e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to
> represent the running guest"), vcpu->arch.tsc_offset meaning was
> changed to always reflect the tsc_offset value set on active VMCS.
> Regardless if vCPU is currently running L1 or L2.
>
> However, above mentioned commit failed to also change
> kvm_vcpu_write_tsc_offset() to set vcpu->arch.tsc_offset correctly.
> This is because vmx_write_tsc_offset() could set the tsc_offset value
> in active VMCS to given offset parameter *plus vmcs12->tsc_offset*.
> However, kvm_vcpu_write_tsc_offset() just sets vcpu->arch.tsc_offset
> to given offset parameter. Without taking into account the possible
> addition of vmcs12->tsc_offset. (Same is true for SVM case).
>
> Fix this issue by changing kvm_x86_ops->write_tsc_offset() to return
> actually set tsc_offset in active VMCS and modify
> kvm_vcpu_write_tsc_offset() to set returned value in
> vcpu->arch.tsc_offset.
> In addition, rename write_tsc_offset() callback to write_l1_tsc_offset()
> to make it clear that it is meant to set L1 TSC offset.

I think write_l1_tsc_offset() is misleading, since the TSC offset
that's actually written in guest mode is L2's TSC offset. Perhaps it
would be more clear to simply rename the argument from 'offset' to
'l1_tsc_offset'?

> Fixes: e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to represent the running guest")
>
> Reviewed-by: Liran Alon <liran.alon@oracle.com>
> Reviewed-by: Mihai Carabas <mihai.carabas@oracle.com>
> Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
> Signed-off-by: Leonid Shatz <leonid.shatz@oracle.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  3 ++-
>  arch/x86/kvm/svm.c              |  5 +++--
>  arch/x86/kvm/vmx.c              | 18 +++++++++---------
>  arch/x86/kvm/x86.c              |  6 +++---
>  4 files changed, 17 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 55e51ff7e421..fbda5a917c5b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1094,7 +1094,8 @@ struct kvm_x86_ops {
>         bool (*has_wbinvd_exit)(void);
>
>         u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
> -       void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> +       /* Returns actual tsc_offset set in active VMCS */

'VMCS' is Intel-centric. Perhaps 'Returns the actual TSC offset set
for the active guest'?

> +       u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
>
>         void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 0e21ccc46792..db788dc5f1e7 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
>         return vcpu->arch.tsc_offset;
>  }
>
> -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> +static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
>         struct vcpu_svm *svm = to_svm(vcpu);
>         u64 g_tsc_offset = 0;
> @@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>         svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
>
>         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> +       return svm->vmcb->control.tsc_offset;
>  }
>
>  static void avic_init_vmcb(struct vcpu_svm *svm)
> @@ -7149,7 +7150,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
>         .has_wbinvd_exit = svm_has_wbinvd_exit,
>
>         .read_l1_tsc_offset = svm_read_l1_tsc_offset,
> -       .write_tsc_offset = svm_write_tsc_offset,
> +       .write_l1_tsc_offset = svm_write_l1_tsc_offset,
>
>         .set_tdp_cr3 = set_tdp_cr3,
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 4555077d69ce..59633175fe6c 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3455,8 +3455,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
>  /*
>   * writes 'offset' into guest's timestamp counter offset register
>   */

The comment above needs some clarification, since 'offset' is only
written to the guest's TSC offset field if L1 is active. When L2 is
active, a different value is calculated and written to the guest's TSC
offset field.

> -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
> +       u64 active_offset = offset;
>         if (is_guest_mode(vcpu)) {
>                 /*
>                  * We're here if L1 chose not to trap WRMSR to TSC. According
> @@ -3464,17 +3465,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>                  * set for L2 remains unchanged, and still needs to be added
>                  * to the newly set TSC to get L2's TSC.
>                  */
> -               struct vmcs12 *vmcs12;
> -               /* recalculate vmcs02.TSC_OFFSET: */
> -               vmcs12 = get_vmcs12(vcpu);
> -               vmcs_write64(TSC_OFFSET, offset +
> -                       (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
> -                        vmcs12->tsc_offset : 0));
> +               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +               if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
> +                       active_offset += vmcs12->tsc_offset;
>         } else {
>                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
>                                            vmcs_read64(TSC_OFFSET), offset);

Why do we only trace for L1?

> -               vmcs_write64(TSC_OFFSET, offset);
>         }
> +
> +       vmcs_write64(TSC_OFFSET, active_offset);
> +       return active_offset;
>  }
Liran Alon Nov. 8, 2018, 8:18 a.m. UTC | #2
> On 8 Nov 2018, at 2:24, Jim Mattson <jmattson@google.com> wrote:
> 
> On Tue, Nov 6, 2018 at 2:14 AM, Liran Alon <liran.alon@oracle.com> wrote:
>> From: Leonid Shatz <leonid.shatz@oracle.com>
>> 
>> Since commit e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to
>> represent the running guest"), vcpu->arch.tsc_offset meaning was
>> changed to always reflect the tsc_offset value set on active VMCS.
>> Regardless if vCPU is currently running L1 or L2.
>> 
>> However, above mentioned commit failed to also change
>> kvm_vcpu_write_tsc_offset() to set vcpu->arch.tsc_offset correctly.
>> This is because vmx_write_tsc_offset() could set the tsc_offset value
>> in active VMCS to given offset parameter *plus vmcs12->tsc_offset*.
>> However, kvm_vcpu_write_tsc_offset() just sets vcpu->arch.tsc_offset
>> to given offset parameter. Without taking into account the possible
>> addition of vmcs12->tsc_offset. (Same is true for SVM case).
>> 
>> Fix this issue by changing kvm_x86_ops->write_tsc_offset() to return
>> actually set tsc_offset in active VMCS and modify
>> kvm_vcpu_write_tsc_offset() to set returned value in
>> vcpu->arch.tsc_offset.
>> In addition, rename write_tsc_offset() callback to write_l1_tsc_offset()
>> to make it clear that it is meant to set L1 TSC offset.
> 
> I think write_l1_tsc_offset() is misleading, since the TSC offset
> that's actually written in guest mode is L2's TSC offset. Perhaps it
> would be more clear to simply rename the argument from 'offset' to
> 'l1_tsc_offset’?

I don’t have strong opinion on this as long as it will be clear that argument is “l1_tsc_offset”.
So I’m fine with your suggestion done when applied.

> 
>> Fixes: e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to represent the running guest")
>> 
>> Reviewed-by: Liran Alon <liran.alon@oracle.com>
>> Reviewed-by: Mihai Carabas <mihai.carabas@oracle.com>
>> Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
>> Signed-off-by: Leonid Shatz <leonid.shatz@oracle.com>
>> ---
>> arch/x86/include/asm/kvm_host.h |  3 ++-
>> arch/x86/kvm/svm.c              |  5 +++--
>> arch/x86/kvm/vmx.c              | 18 +++++++++---------
>> arch/x86/kvm/x86.c              |  6 +++---
>> 4 files changed, 17 insertions(+), 15 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 55e51ff7e421..fbda5a917c5b 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1094,7 +1094,8 @@ struct kvm_x86_ops {
>>        bool (*has_wbinvd_exit)(void);
>> 
>>        u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
>> -       void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
>> +       /* Returns actual tsc_offset set in active VMCS */
> 
> 'VMCS' is Intel-centric. Perhaps 'Returns the actual TSC offset set
> for the active guest’?

Fine with doing this comment change when applying.

> 
>> +       u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
>> 
>>        void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
>> 
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index 0e21ccc46792..db788dc5f1e7 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
>>        return vcpu->arch.tsc_offset;
>> }
>> 
>> -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>> +static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>> {
>>        struct vcpu_svm *svm = to_svm(vcpu);
>>        u64 g_tsc_offset = 0;
>> @@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>>        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
>> 
>>        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
>> +       return svm->vmcb->control.tsc_offset;
>> }
>> 
>> static void avic_init_vmcb(struct vcpu_svm *svm)
>> @@ -7149,7 +7150,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
>>        .has_wbinvd_exit = svm_has_wbinvd_exit,
>> 
>>        .read_l1_tsc_offset = svm_read_l1_tsc_offset,
>> -       .write_tsc_offset = svm_write_tsc_offset,
>> +       .write_l1_tsc_offset = svm_write_l1_tsc_offset,
>> 
>>        .set_tdp_cr3 = set_tdp_cr3,
>> 
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 4555077d69ce..59633175fe6c 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -3455,8 +3455,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
>> /*
>>  * writes 'offset' into guest's timestamp counter offset register
>>  */
> 
> The comment above needs some clarification, since 'offset' is only
> written to the guest's TSC offset field if L1 is active. When L2 is
> active, a different value is calculated and written to the guest's TSC
> offset field.

What about the following as an alternative:
/* Updates active guest TSC offset based on given L1 TSC offset */

> 
>> -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>> +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>> {
>> +       u64 active_offset = offset;
>>        if (is_guest_mode(vcpu)) {
>>                /*
>>                 * We're here if L1 chose not to trap WRMSR to TSC. According
>> @@ -3464,17 +3465,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>>                 * set for L2 remains unchanged, and still needs to be added
>>                 * to the newly set TSC to get L2's TSC.
>>                 */
>> -               struct vmcs12 *vmcs12;
>> -               /* recalculate vmcs02.TSC_OFFSET: */
>> -               vmcs12 = get_vmcs12(vcpu);
>> -               vmcs_write64(TSC_OFFSET, offset +
>> -                       (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
>> -                        vmcs12->tsc_offset : 0));
>> +               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>> +               if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
>> +                       active_offset += vmcs12->tsc_offset;
>>        } else {
>>                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
>>                                           vmcs_read64(TSC_OFFSET), offset);
> 
> Why do we only trace for L1?

I thought about the exact same thing.
Didn’t want to change this on this patch though as it is unrelated.
I also think we should create a separate patch that moves trace to happen in both cases.

> 
>> -               vmcs_write64(TSC_OFFSET, offset);
>>        }
>> +
>> +       vmcs_write64(TSC_OFFSET, active_offset);
>> +       return active_offset;
>> }


Paolo/Radim, do you wish that we will send v2 for these comments/renames suggestions
or you will perform these changes when applying?

Thanks,
-Liran
Paolo Bonzini Nov. 25, 2018, 5:42 p.m. UTC | #3
On 08/11/18 01:24, Jim Mattson wrote:
> On Tue, Nov 6, 2018 at 2:14 AM, Liran Alon <liran.alon@oracle.com> wrote:
>> From: Leonid Shatz <leonid.shatz@oracle.com>
>>
>> Since commit e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to
>> represent the running guest"), vcpu->arch.tsc_offset meaning was
>> changed to always reflect the tsc_offset value set on active VMCS.
>> Regardless if vCPU is currently running L1 or L2.
>>
>> However, above mentioned commit failed to also change
>> kvm_vcpu_write_tsc_offset() to set vcpu->arch.tsc_offset correctly.
>> This is because vmx_write_tsc_offset() could set the tsc_offset value
>> in active VMCS to given offset parameter *plus vmcs12->tsc_offset*.
>> However, kvm_vcpu_write_tsc_offset() just sets vcpu->arch.tsc_offset
>> to given offset parameter. Without taking into account the possible
>> addition of vmcs12->tsc_offset. (Same is true for SVM case).
>>
>> Fix this issue by changing kvm_x86_ops->write_tsc_offset() to return
>> actually set tsc_offset in active VMCS and modify
>> kvm_vcpu_write_tsc_offset() to set returned value in
>> vcpu->arch.tsc_offset.
>> In addition, rename write_tsc_offset() callback to write_l1_tsc_offset()
>> to make it clear that it is meant to set L1 TSC offset.
> 
> I think write_l1_tsc_offset() is misleading, since the TSC offset
> that's actually written in guest mode is L2's TSC offset. Perhaps it
> would be more clear to simply rename the argument from 'offset' to
> 'l1_tsc_offset'?
> 
>> Fixes: e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to represent the running guest")
>>
>> Reviewed-by: Liran Alon <liran.alon@oracle.com>
>> Reviewed-by: Mihai Carabas <mihai.carabas@oracle.com>
>> Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
>> Signed-off-by: Leonid Shatz <leonid.shatz@oracle.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |  3 ++-
>>  arch/x86/kvm/svm.c              |  5 +++--
>>  arch/x86/kvm/vmx.c              | 18 +++++++++---------
>>  arch/x86/kvm/x86.c              |  6 +++---
>>  4 files changed, 17 insertions(+), 15 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 55e51ff7e421..fbda5a917c5b 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1094,7 +1094,8 @@ struct kvm_x86_ops {
>>         bool (*has_wbinvd_exit)(void);
>>
>>         u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
>> -       void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
>> +       /* Returns actual tsc_offset set in active VMCS */
> 
> 'VMCS' is Intel-centric. Perhaps 'Returns the actual TSC offset set
> for the active guest'?
> 
>> +       u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
>>
>>         void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
>>
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index 0e21ccc46792..db788dc5f1e7 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
>>         return vcpu->arch.tsc_offset;
>>  }
>>
>> -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>> +static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>>  {
>>         struct vcpu_svm *svm = to_svm(vcpu);
>>         u64 g_tsc_offset = 0;
>> @@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>>         svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
>>
>>         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
>> +       return svm->vmcb->control.tsc_offset;
>>  }
>>
>>  static void avic_init_vmcb(struct vcpu_svm *svm)
>> @@ -7149,7 +7150,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
>>         .has_wbinvd_exit = svm_has_wbinvd_exit,
>>
>>         .read_l1_tsc_offset = svm_read_l1_tsc_offset,
>> -       .write_tsc_offset = svm_write_tsc_offset,
>> +       .write_l1_tsc_offset = svm_write_l1_tsc_offset,
>>
>>         .set_tdp_cr3 = set_tdp_cr3,
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 4555077d69ce..59633175fe6c 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -3455,8 +3455,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
>>  /*
>>   * writes 'offset' into guest's timestamp counter offset register
>>   */
> 
> The comment above needs some clarification, since 'offset' is only
> written to the guest's TSC offset field if L1 is active. When L2 is
> active, a different value is calculated and written to the guest's TSC
> offset field.

I just removed the comment, it's more misleading than anything else.

I don't like that much the new kvm_x86_ops name, but at least it's
consistent with read_l1_tsc_offset.

With some cleanup and making the trace event available for L2 writes
too, I'm squashing this in:

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2085539149ed..f03c46c183d8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3464,27 +3464,26 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 	return vcpu->arch.tsc_offset;
 }

-/*
- * writes 'offset' into guest's timestamp counter offset register
- */
 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	u64 old_offset = vcpu->arch.tsc_offset;
 	u64 active_offset = offset;
-	if (is_guest_mode(vcpu)) {
+
+	if (is_guest_mode(vcpu) &&
+	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) {
 		/*
 		 * We're here if L1 chose not to trap WRMSR to TSC. According
 		 * to the spec, this should set L1's TSC; The offset that L1
 		 * set for L2 remains unchanged, and still needs to be added
 		 * to the newly set TSC to get L2's TSC.
 		 */
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
-			active_offset += vmcs12->tsc_offset;
-	} else {
-		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
-					   vmcs_read64(TSC_OFFSET), offset);
+		old_offset -= vmcs12->tsc_offset;
+		active_offset += vmcs12->tsc_offset;
 	}

+	trace_kvm_write_tsc_offset(vcpu->vcpu_id, old_offset, offset);
 	vmcs_write64(TSC_OFFSET, active_offset);
 	return active_offset;
 }

Paolo

>> -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>> +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>>  {
>> +       u64 active_offset = offset;
>>         if (is_guest_mode(vcpu)) {
>>                 /*
>>                  * We're here if L1 chose not to trap WRMSR to TSC. According
>> @@ -3464,17 +3465,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>>                  * set for L2 remains unchanged, and still needs to be added
>>                  * to the newly set TSC to get L2's TSC.
>>                  */
>> -               struct vmcs12 *vmcs12;
>> -               /* recalculate vmcs02.TSC_OFFSET: */
>> -               vmcs12 = get_vmcs12(vcpu);
>> -               vmcs_write64(TSC_OFFSET, offset +
>> -                       (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
>> -                        vmcs12->tsc_offset : 0));
>> +               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>> +               if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
>> +                       active_offset += vmcs12->tsc_offset;
>>         } else {
>>                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
>>                                            vmcs_read64(TSC_OFFSET), offset);
> 
> Why do we only trace for L1?
> 
>> -               vmcs_write64(TSC_OFFSET, offset);
>>         }
>> +
>> +       vmcs_write64(TSC_OFFSET, active_offset);
>> +       return active_offset;
>>  }
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55e51ff7e421..fbda5a917c5b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1094,7 +1094,8 @@  struct kvm_x86_ops {
 	bool (*has_wbinvd_exit)(void);
 
 	u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
-	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+	/* Returns actual tsc_offset set in active VMCS */
+	u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e21ccc46792..db788dc5f1e7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1446,7 +1446,7 @@  static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 	return vcpu->arch.tsc_offset;
 }
 
-static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 g_tsc_offset = 0;
@@ -1464,6 +1464,7 @@  static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
 
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+	return svm->vmcb->control.tsc_offset;
 }
 
 static void avic_init_vmcb(struct vcpu_svm *svm)
@@ -7149,7 +7150,7 @@  static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
 	.read_l1_tsc_offset = svm_read_l1_tsc_offset,
-	.write_tsc_offset = svm_write_tsc_offset,
+	.write_l1_tsc_offset = svm_write_l1_tsc_offset,
 
 	.set_tdp_cr3 = set_tdp_cr3,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4555077d69ce..59633175fe6c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3455,8 +3455,9 @@  static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
+	u64 active_offset = offset;
 	if (is_guest_mode(vcpu)) {
 		/*
 		 * We're here if L1 chose not to trap WRMSR to TSC. According
@@ -3464,17 +3465,16 @@  static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 		 * set for L2 remains unchanged, and still needs to be added
 		 * to the newly set TSC to get L2's TSC.
 		 */
-		struct vmcs12 *vmcs12;
-		/* recalculate vmcs02.TSC_OFFSET: */
-		vmcs12 = get_vmcs12(vcpu);
-		vmcs_write64(TSC_OFFSET, offset +
-			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
-			 vmcs12->tsc_offset : 0));
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
+			active_offset += vmcs12->tsc_offset;
 	} else {
 		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
 					   vmcs_read64(TSC_OFFSET), offset);
-		vmcs_write64(TSC_OFFSET, offset);
 	}
+
+	vmcs_write64(TSC_OFFSET, active_offset);
+	return active_offset;
 }
 
 /*
@@ -15062,7 +15062,7 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
 	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
-	.write_tsc_offset = vmx_write_tsc_offset,
+	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
 
 	.set_tdp_cr3 = vmx_set_cr3,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8525dc75a79a..a8811e395216 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1768,8 +1768,7 @@  EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
-	vcpu->arch.tsc_offset = offset;
+	vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
 }
 
 static inline bool kvm_check_tsc_unstable(void)
@@ -1897,7 +1896,8 @@  EXPORT_SYMBOL_GPL(kvm_write_tsc);
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 					   s64 adjustment)
 {
-	kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
+	u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
 }
 
 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)