diff mbox series

[v2,2/3] KVM: x86: Include host suspended time in steal time.

Message ID 20240820043543.837914-3-suleiman@google.com (mailing list archive)
State New
Headers show
Series KVM: x86: Include host suspended time in steal time. | expand

Commit Message

Suleiman Souhlal Aug. 20, 2024, 4:35 a.m. UTC
When the host resumes from a suspend, the guest thinks any task
that was running during the suspend ran for a long time, even though
the effective run time was much shorter, which can end up having
negative effects on scheduling. This can be particularly noticeable
if the guest task was RT, as it can end up getting throttled for a
long time.

To mitigate this issue, we include the time that the host was
suspended in steal time, which lets the guest subtract the duration from
the tasks' runtime.

Note that the case of a suspend happening during a VM migration
might not be accounted for.

Signed-off-by: Suleiman Souhlal <suleiman@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 11 ++++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

Comments

Chao Gao Aug. 21, 2024, 6:31 a.m. UTC | #1
On Tue, Aug 20, 2024 at 01:35:42PM +0900, Suleiman Souhlal wrote:
>When the host resumes from a suspend, the guest thinks any task
>that was running during the suspend ran for a long time, even though
>the effective run time was much shorter, which can end up having
>negative effects with scheduling. This can be particularly noticeable
>if the guest task was RT, as it can end up getting throttled for a
>long time.
>
>To mitigate this issue, we include the time that the host was
>suspended in steal time, which lets the guest subtract the duration from
>the tasks' runtime.
>
>Note that the case of a suspend happening during a VM migration
>might not be accounted.
>
>Signed-off-by: Suleiman Souhlal <suleiman@google.com>
>---
> arch/x86/include/asm/kvm_host.h |  1 +
> arch/x86/kvm/x86.c              | 11 ++++++++++-
> 2 files changed, 11 insertions(+), 1 deletion(-)
>
>diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>index 4a68cb3eba78f8..728798decb6d12 100644
>--- a/arch/x86/include/asm/kvm_host.h
>+++ b/arch/x86/include/asm/kvm_host.h
>@@ -898,6 +898,7 @@ struct kvm_vcpu_arch {
> 		u8 preempted;
> 		u64 msr_val;
> 		u64 last_steal;
>+		u64 last_suspend_ns;
> 		struct gfn_to_hva_cache cache;
> 	} st;
> 
>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index 70219e4069874a..104f3d318026fa 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -3654,7 +3654,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> 	struct kvm_steal_time __user *st;
> 	struct kvm_memslots *slots;
> 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
>-	u64 steal;
>+	u64 steal, suspend_ns;
> 	u32 version;
> 
> 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
>@@ -3735,6 +3735,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> 	steal += current->sched_info.run_delay -
> 		vcpu->arch.st.last_steal;
> 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
>+	/*
>+	 * Include the time that the host was suspended in steal time.
>+	 * Note that the case of a suspend happening during a VM migration
>+	 * might not be accounted.
>+	 */
>+	suspend_ns = kvm_total_suspend_ns();
>+	steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
>+	vcpu->arch.st.last_suspend_ns = suspend_ns;

The document in patch 3 states:

  Time during which the vcpu is idle, will not be reported as steal time

I'm wondering if all host suspend time should be reported as steal time,
or if the suspend time during a vCPU halt should be excluded.

> 	unsafe_put_user(steal, &st->steal, out);
> 
> 	version += 1;
>@@ -12280,6 +12288,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> 
> 	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
> 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
>+	vcpu->arch.st.last_suspend_ns = kvm_total_suspend_ns();

Is this necessary? I doubt it, because KVM doesn't capture
current->sched_info.run_delay here.

> 	kvm_xen_init_vcpu(vcpu);
> 	vcpu_load(vcpu);
> 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
>-- 
>2.46.0.184.g6999bdac58-goog
>
Suleiman Souhlal Aug. 23, 2024, 4:17 a.m. UTC | #2
On Wed, Aug 21, 2024 at 3:31 PM Chao Gao <chao.gao@intel.com> wrote:
>
> On Tue, Aug 20, 2024 at 01:35:42PM +0900, Suleiman Souhlal wrote:
> >When the host resumes from a suspend, the guest thinks any task
> >that was running during the suspend ran for a long time, even though
> >the effective run time was much shorter, which can end up having
> >negative effects with scheduling. This can be particularly noticeable
> >if the guest task was RT, as it can end up getting throttled for a
> >long time.
> >
> >To mitigate this issue, we include the time that the host was
> >suspended in steal time, which lets the guest subtract the duration from
> >the tasks' runtime.
> >
> >Note that the case of a suspend happening during a VM migration
> >might not be accounted.
> >
> >Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> >---
> > arch/x86/include/asm/kvm_host.h |  1 +
> > arch/x86/kvm/x86.c              | 11 ++++++++++-
> > 2 files changed, 11 insertions(+), 1 deletion(-)
> >
> >diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >index 4a68cb3eba78f8..728798decb6d12 100644
> >--- a/arch/x86/include/asm/kvm_host.h
> >+++ b/arch/x86/include/asm/kvm_host.h
> >@@ -898,6 +898,7 @@ struct kvm_vcpu_arch {
> >               u8 preempted;
> >               u64 msr_val;
> >               u64 last_steal;
> >+              u64 last_suspend_ns;
> >               struct gfn_to_hva_cache cache;
> >       } st;
> >
> >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >index 70219e4069874a..104f3d318026fa 100644
> >--- a/arch/x86/kvm/x86.c
> >+++ b/arch/x86/kvm/x86.c
> >@@ -3654,7 +3654,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >       struct kvm_steal_time __user *st;
> >       struct kvm_memslots *slots;
> >       gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> >-      u64 steal;
> >+      u64 steal, suspend_ns;
> >       u32 version;
> >
> >       if (kvm_xen_msr_enabled(vcpu->kvm)) {
> >@@ -3735,6 +3735,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >       steal += current->sched_info.run_delay -
> >               vcpu->arch.st.last_steal;
> >       vcpu->arch.st.last_steal = current->sched_info.run_delay;
> >+      /*
> >+       * Include the time that the host was suspended in steal time.
> >+       * Note that the case of a suspend happening during a VM migration
> >+       * might not be accounted.
> >+       */
> >+      suspend_ns = kvm_total_suspend_ns();
> >+      steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
> >+      vcpu->arch.st.last_suspend_ns = suspend_ns;
>
> The document in patch 3 states:
>
>   Time during which the vcpu is idle, will not be reported as steal time
>
> I'm wondering if all host suspend time should be reported as steal time,
> or if the suspend time during a vCPU halt should be excluded.

I think the statement about idle time not being reported as steal isn't
completely accurate, so I'm not sure if it's worth the extra complexity.

>
> >       unsafe_put_user(steal, &st->steal, out);
> >
> >       version += 1;
> >@@ -12280,6 +12288,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> >
> >       vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
> >       vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
> >+      vcpu->arch.st.last_suspend_ns = kvm_total_suspend_ns();
>
> is this necessary? I doubt this because KVM doesn't capture
> current->sched_info.run_delay here.

Isn't run_delay being captured by the scheduler at all times?

We need to initialize last_suspend_ns; otherwise, the first call to
record_steal_time() for a VCPU would report a wrong value if
the VCPU is started after the host has already had a suspend.

Thanks,
-- Suleiman
Chao Gao Aug. 23, 2024, 5:25 a.m. UTC | #3
On Fri, Aug 23, 2024 at 01:17:31PM +0900, Suleiman Souhlal wrote:
>On Wed, Aug 21, 2024 at 3:31 PM Chao Gao <chao.gao@intel.com> wrote:
>>
>> On Tue, Aug 20, 2024 at 01:35:42PM +0900, Suleiman Souhlal wrote:
>> >When the host resumes from a suspend, the guest thinks any task
>> >that was running during the suspend ran for a long time, even though
>> >the effective run time was much shorter, which can end up having
>> >negative effects with scheduling. This can be particularly noticeable
>> >if the guest task was RT, as it can end up getting throttled for a
>> >long time.
>> >
>> >To mitigate this issue, we include the time that the host was
>> >suspended in steal time, which lets the guest subtract the duration from
>> >the tasks' runtime.
>> >
>> >Note that the case of a suspend happening during a VM migration
>> >might not be accounted.
>> >
>> >Signed-off-by: Suleiman Souhlal <suleiman@google.com>
>> >---
>> > arch/x86/include/asm/kvm_host.h |  1 +
>> > arch/x86/kvm/x86.c              | 11 ++++++++++-
>> > 2 files changed, 11 insertions(+), 1 deletion(-)
>> >
>> >diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> >index 4a68cb3eba78f8..728798decb6d12 100644
>> >--- a/arch/x86/include/asm/kvm_host.h
>> >+++ b/arch/x86/include/asm/kvm_host.h
>> >@@ -898,6 +898,7 @@ struct kvm_vcpu_arch {
>> >               u8 preempted;
>> >               u64 msr_val;
>> >               u64 last_steal;
>> >+              u64 last_suspend_ns;
>> >               struct gfn_to_hva_cache cache;
>> >       } st;
>> >
>> >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >index 70219e4069874a..104f3d318026fa 100644
>> >--- a/arch/x86/kvm/x86.c
>> >+++ b/arch/x86/kvm/x86.c
>> >@@ -3654,7 +3654,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>> >       struct kvm_steal_time __user *st;
>> >       struct kvm_memslots *slots;
>> >       gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
>> >-      u64 steal;
>> >+      u64 steal, suspend_ns;
>> >       u32 version;
>> >
>> >       if (kvm_xen_msr_enabled(vcpu->kvm)) {
>> >@@ -3735,6 +3735,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>> >       steal += current->sched_info.run_delay -
>> >               vcpu->arch.st.last_steal;
>> >       vcpu->arch.st.last_steal = current->sched_info.run_delay;
>> >+      /*
>> >+       * Include the time that the host was suspended in steal time.
>> >+       * Note that the case of a suspend happening during a VM migration
>> >+       * might not be accounted.
>> >+       */
>> >+      suspend_ns = kvm_total_suspend_ns();
>> >+      steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
>> >+      vcpu->arch.st.last_suspend_ns = suspend_ns;
>>
>> The document in patch 3 states:
>>
>>   Time during which the vcpu is idle, will not be reported as steal time
>>
>> I'm wondering if all host suspend time should be reported as steal time,
>> or if the suspend time during a vCPU halt should be excluded.
>
>I think the statement about idle time not being reported as steal isn't
>completely accurate, so I'm not sure if it's worth the extra complexity.
>
>>
>> >       unsafe_put_user(steal, &st->steal, out);
>> >
>> >       version += 1;
>> >@@ -12280,6 +12288,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>> >
>> >       vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
>> >       vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
>> >+      vcpu->arch.st.last_suspend_ns = kvm_total_suspend_ns();
>>
>> is this necessary? I doubt this because KVM doesn't capture
>> current->sched_info.run_delay here.
>
>Isn't run_delay being captured by the scheduler at all time?

I meant KVM doesn't do:

	vcpu->arch.st.last_steal = current->sched_info.run_delay;

at vCPU creation time.

>
>We need to initialize last_suspend_ns otherwise the first call to
>record_steal_time() for a VCPU would report a wrong value if
>the VCPU is started after the host has already had a suspend.

But initializing last_suspend_ns here doesn't guarantee KVM won't report a
"wrong" value because a suspend can happen after vCPU creation and before
its first VM-enter.
Suleiman Souhlal Aug. 23, 2024, 5:43 a.m. UTC | #4
On Fri, Aug 23, 2024 at 2:25 PM Chao Gao <chao.gao@intel.com> wrote:
>
> On Fri, Aug 23, 2024 at 01:17:31PM +0900, Suleiman Souhlal wrote:
> >On Wed, Aug 21, 2024 at 3:31 PM Chao Gao <chao.gao@intel.com> wrote:
> >>
> >> On Tue, Aug 20, 2024 at 01:35:42PM +0900, Suleiman Souhlal wrote:
> >> >When the host resumes from a suspend, the guest thinks any task
> >> >that was running during the suspend ran for a long time, even though
> >> >the effective run time was much shorter, which can end up having
> >> >negative effects with scheduling. This can be particularly noticeable
> >> >if the guest task was RT, as it can end up getting throttled for a
> >> >long time.
> >> >
> >> >To mitigate this issue, we include the time that the host was
> >> >suspended in steal time, which lets the guest subtract the duration from
> >> >the tasks' runtime.
> >> >
> >> >Note that the case of a suspend happening during a VM migration
> >> >might not be accounted.
> >> >
> >> >Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> >> >---
> >> > arch/x86/include/asm/kvm_host.h |  1 +
> >> > arch/x86/kvm/x86.c              | 11 ++++++++++-
> >> > 2 files changed, 11 insertions(+), 1 deletion(-)
> >> >
> >> >diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> >index 4a68cb3eba78f8..728798decb6d12 100644
> >> >--- a/arch/x86/include/asm/kvm_host.h
> >> >+++ b/arch/x86/include/asm/kvm_host.h
> >> >@@ -898,6 +898,7 @@ struct kvm_vcpu_arch {
> >> >               u8 preempted;
> >> >               u64 msr_val;
> >> >               u64 last_steal;
> >> >+              u64 last_suspend_ns;
> >> >               struct gfn_to_hva_cache cache;
> >> >       } st;
> >> >
> >> >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> >index 70219e4069874a..104f3d318026fa 100644
> >> >--- a/arch/x86/kvm/x86.c
> >> >+++ b/arch/x86/kvm/x86.c
> >> >@@ -3654,7 +3654,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >> >       struct kvm_steal_time __user *st;
> >> >       struct kvm_memslots *slots;
> >> >       gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> >> >-      u64 steal;
> >> >+      u64 steal, suspend_ns;
> >> >       u32 version;
> >> >
> >> >       if (kvm_xen_msr_enabled(vcpu->kvm)) {
> >> >@@ -3735,6 +3735,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >> >       steal += current->sched_info.run_delay -
> >> >               vcpu->arch.st.last_steal;
> >> >       vcpu->arch.st.last_steal = current->sched_info.run_delay;
> >> >+      /*
> >> >+       * Include the time that the host was suspended in steal time.
> >> >+       * Note that the case of a suspend happening during a VM migration
> >> >+       * might not be accounted.
> >> >+       */
> >> >+      suspend_ns = kvm_total_suspend_ns();
> >> >+      steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
> >> >+      vcpu->arch.st.last_suspend_ns = suspend_ns;
> >>
> >> The document in patch 3 states:
> >>
> >>   Time during which the vcpu is idle, will not be reported as steal time
> >>
> >> I'm wondering if all host suspend time should be reported as steal time,
> >> or if the suspend time during a vCPU halt should be excluded.
> >
> >I think the statement about idle time not being reported as steal isn't
> >completely accurate, so I'm not sure if it's worth the extra complexity.
> >
> >>
> >> >       unsafe_put_user(steal, &st->steal, out);
> >> >
> >> >       version += 1;
> >> >@@ -12280,6 +12288,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> >> >
> >> >       vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
> >> >       vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
> >> >+      vcpu->arch.st.last_suspend_ns = kvm_total_suspend_ns();
> >>
> >> is this necessary? I doubt this because KVM doesn't capture
> >> current->sched_info.run_delay here.
> >
> >Isn't run_delay being captured by the scheduler at all time?
>
> I meant KVM doesn't do:
>
>         vcpu->arch.st.last_steal = current->sched_info.run_delay;
>
> at vCPU creation time.

I think for run_delay it's different because run_delay is a time
difference. It's something that gets added to steal, not relative
to the previous steal value.
From what I can tell, it's correct for last_steal to be initialized to 0.

>
> >
> >We need to initialize last_suspend_ns otherwise the first call to
> >record_steal_time() for a VCPU would report a wrong value if
> >the VCPU is started after the host has already had a suspend.
>
> But initializing last_suspend_ns here doesn't guarantee KVM won't report a
> "wrong" value because a suspend can happen after vCPU creation and before
> its first VM-enter.

I see what you're saying.
I'm not sure how much this matters in practice.

-- Suleiman
Suleiman Souhlal Aug. 28, 2024, 9:56 a.m. UTC | #5
On Tue, Aug 20, 2024 at 1:38 PM Suleiman Souhlal <suleiman@google.com> wrote:
> @@ -3735,6 +3735,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>         steal += current->sched_info.run_delay -
>                 vcpu->arch.st.last_steal;
>         vcpu->arch.st.last_steal = current->sched_info.run_delay;
> +       /*
> +        * Include the time that the host was suspended in steal time.
> +        * Note that the case of a suspend happening during a VM migration
> +        * might not be accounted.
> +        */
> +       suspend_ns = kvm_total_suspend_ns();
> +       steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
> +       vcpu->arch.st.last_suspend_ns = suspend_ns;
>         unsafe_put_user(steal, &st->steal, out);
>
>         version += 1;

There is an issue here: we are calling a function
(kvm_total_suspend_ns()) inside the UACCESS region, i.e. in the
unsafe_put_user() block, which raises an objtool warning.
I'll be sending a v3 with that addressed (and the function return
value wrapping in patch 1/3).

-- Suleiman
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4a68cb3eba78f8..728798decb6d12 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -898,6 +898,7 @@  struct kvm_vcpu_arch {
 		u8 preempted;
 		u64 msr_val;
 		u64 last_steal;
+		u64 last_suspend_ns;
 		struct gfn_to_hva_cache cache;
 	} st;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 70219e4069874a..104f3d318026fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3654,7 +3654,7 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 	struct kvm_steal_time __user *st;
 	struct kvm_memslots *slots;
 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
-	u64 steal;
+	u64 steal, suspend_ns;
 	u32 version;
 
 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
@@ -3735,6 +3735,14 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	/*
+	 * Include the time that the host was suspended in steal time.
+	 * Note that the case of a suspend happening during a VM migration
+	 * might not be accounted.
+	 */
+	suspend_ns = kvm_total_suspend_ns();
+	steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
+	vcpu->arch.st.last_suspend_ns = suspend_ns;
 	unsafe_put_user(steal, &st->steal, out);
 
 	version += 1;
@@ -12280,6 +12288,7 @@  int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+	vcpu->arch.st.last_suspend_ns = kvm_total_suspend_ns();
 	kvm_xen_init_vcpu(vcpu);
 	vcpu_load(vcpu);
 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);