diff mbox series

[v3,2/3] KVM: x86: Include host suspended time in steal time.

Message ID 20250107042202.2554063-3-suleiman@google.com (mailing list archive)
State New
Headers show
Series KVM: x86: Include host suspended time in steal time. | expand

Commit Message

Suleiman Souhlal Jan. 7, 2025, 4:22 a.m. UTC
When the host resumes from a suspend, the guest thinks any task
that was running during the suspend ran for a long time, even though
the effective run time was much shorter, which can end up having
negative effects with scheduling. This can be particularly noticeable
if the guest task was RT, as it can end up getting throttled for a
long time.

To mitigate this issue, we include the time that the host was
suspended in steal time, which lets the guest subtract the
duration from the tasks' runtime.

Note that the case of a suspend happening during a VM migration
might not be accounted for.

Change-Id: I18d1d17d4d0d6f4c89b312e427036e052c47e1fa
Signed-off-by: Suleiman Souhlal <suleiman@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 11 ++++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

Comments

Sean Christopherson Jan. 7, 2025, 3:37 p.m. UTC | #1
On Tue, Jan 07, 2025, Suleiman Souhlal wrote:
> When the host resumes from a suspend, the guest thinks any task
> that was running during the suspend ran for a long time, even though
> the effective run time was much shorter, which can end up having
> negative effects with scheduling. This can be particularly noticeable
> if the guest task was RT, as it can end up getting throttled for a
> long time.
> 
> To mitigate this issue, we include the time that the host was

No "we".

> suspended in steal time, which lets the guest can subtract the
> duration from the tasks' runtime.
> 
> Note that the case of a suspend happening during a VM migration
> might not be accounted.

And this isn't considered a bug because?  I asked for documentation, not a
statement of fact.

> Change-Id: I18d1d17d4d0d6f4c89b312e427036e052c47e1fa

gerrit.

> Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/x86.c              | 11 ++++++++++-
>  2 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e159e44a6a1b61..01d44d527a7f88 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -897,6 +897,7 @@ struct kvm_vcpu_arch {
>  		u8 preempted;
>  		u64 msr_val;
>  		u64 last_steal;
> +		u64 last_suspend_ns;
>  		struct gfn_to_hva_cache cache;
>  	} st;
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c8160baf383851..12439edc36f83a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3650,7 +3650,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>  	struct kvm_steal_time __user *st;
>  	struct kvm_memslots *slots;
>  	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> -	u64 steal;
> +	u64 steal, suspend_ns;
>  	u32 version;
>  
>  	if (kvm_xen_msr_enabled(vcpu->kvm)) {
> @@ -3677,6 +3677,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>  			return;
>  	}
>  
> +	suspend_ns = kvm_total_suspend_ns();
>  	st = (struct kvm_steal_time __user *)ghc->hva;
>  	/*
>  	 * Doing a TLB flush here, on the guest's behalf, can avoid
> @@ -3731,6 +3732,13 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>  	steal += current->sched_info.run_delay -
>  		vcpu->arch.st.last_steal;
>  	vcpu->arch.st.last_steal = current->sched_info.run_delay;
> +	/*
> +	 * Include the time that the host was suspended in steal time.
> +	 * Note that the case of a suspend happening during a VM migration
> +	 * might not be accounted.
> +	 */

This is not a useful comment.  It's quite clear that the suspend time is being
accumulated into steal time, and restating the migration caveat does more harm
than good, as that flaw is an issue with the overall design, i.e. has nothing to
do with this specific snippet of code.

> +	steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
> +	vcpu->arch.st.last_suspend_ns = suspend_ns;
>  	unsafe_put_user(steal, &st->steal, out);
>  
>  	version += 1;
> @@ -12299,6 +12307,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  	if (r)
>  		goto free_guest_fpu;
>  
> +	vcpu->arch.st.last_suspend_ns = kvm_total_suspend_ns();
>  	kvm_xen_init_vcpu(vcpu);
>  	vcpu_load(vcpu);
>  	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
> -- 
> 2.47.1.613.gc27f4b7a9f-goog
>
Suleiman Souhlal Jan. 8, 2025, 4:05 a.m. UTC | #2
On Wed, Jan 8, 2025 at 12:37 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jan 07, 2025, Suleiman Souhlal wrote:
> > When the host resumes from a suspend, the guest thinks any task
> > that was running during the suspend ran for a long time, even though
> > the effective run time was much shorter, which can end up having
> > negative effects with scheduling. This can be particularly noticeable
> > if the guest task was RT, as it can end up getting throttled for a
> > long time.
> >
> > To mitigate this issue, we include the time that the host was
>
> No "we".
>
> > suspended in steal time, which lets the guest can subtract the
> > duration from the tasks' runtime.
> >
> > Note that the case of a suspend happening during a VM migration
> > might not be accounted.
>
> And this isn't considered a bug because?  I asked for documentation, not a
> statement of fact.

I guess I don't really understand what the difference between
documentation and statements of fact is.

It's not completely clear to me what the desired behavior would be
when suspending during a VM migration.
If we wanted to inject the suspend duration that happened after the
migration started, but before it ended, I suppose we would need to add
a way for the new VM instance to add to steal time, possibly through a
new uAPI.

It is also not clear to me why we would want that.

>
> > Change-Id: I18d1d17d4d0d6f4c89b312e427036e052c47e1fa
>
> gerrit.
>
> > Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  1 +
> >  arch/x86/kvm/x86.c              | 11 ++++++++++-
> >  2 files changed, 11 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index e159e44a6a1b61..01d44d527a7f88 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -897,6 +897,7 @@ struct kvm_vcpu_arch {
> >               u8 preempted;
> >               u64 msr_val;
> >               u64 last_steal;
> > +             u64 last_suspend_ns;
> >               struct gfn_to_hva_cache cache;
> >       } st;
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index c8160baf383851..12439edc36f83a 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3650,7 +3650,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >       struct kvm_steal_time __user *st;
> >       struct kvm_memslots *slots;
> >       gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> > -     u64 steal;
> > +     u64 steal, suspend_ns;
> >       u32 version;
> >
> >       if (kvm_xen_msr_enabled(vcpu->kvm)) {
> > @@ -3677,6 +3677,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >                       return;
> >       }
> >
> > +     suspend_ns = kvm_total_suspend_ns();
> >       st = (struct kvm_steal_time __user *)ghc->hva;
> >       /*
> >        * Doing a TLB flush here, on the guest's behalf, can avoid
> > @@ -3731,6 +3732,13 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> >       steal += current->sched_info.run_delay -
> >               vcpu->arch.st.last_steal;
> >       vcpu->arch.st.last_steal = current->sched_info.run_delay;
> > +     /*
> > +      * Include the time that the host was suspended in steal time.
> > +      * Note that the case of a suspend happening during a VM migration
> > +      * might not be accounted.
> > +      */
>
> This is not a useful comment.  It's quite clear what that suspend time is being
> accumulated into steal_time, and restating the migration caveat does more harm
> than good, as that flaw is an issue with the overall design, i.e. has nothing to
> do with this specific snippet of code.

I will remove it.

Thanks,
-- Suleiman
Sean Christopherson Jan. 8, 2025, 3:17 p.m. UTC | #3
On Wed, Jan 08, 2025, Suleiman Souhlal wrote:
> On Wed, Jan 8, 2025 at 12:37 AM Sean Christopherson <seanjc@google.com> wrote:
> > On Tue, Jan 07, 2025, Suleiman Souhlal wrote:
> > > Note that the case of a suspend happening during a VM migration
> > > might not be accounted.
> >
> > And this isn't considered a bug because?  I asked for documentation, not a
> > statement of fact.
> 
> I guess I don't really understand what the difference between documentation
> and statements of fact is.

It's the difference between "X may be buggy" and "X has this exact caveat because
KVM doesn't support saving/restoring associated metadata".  For this case, I'm
pretty sure it's possible to document exactly when suspended time will be lost,
what may happen if that accounted time is lost, what userspace could do to avoid
dropping suspend time, and finally for the changelog only, specifically why KVM
support for accounting suspended time is *intentionally* not providing APIs to
save/restore pending suspended time.

> It's not completely clear to me what the desired behavior would be when
> suspending during a VM migration.

That's fine, it's not your (or KVM's) responsibility to know the desired
behavior for every possible/theoretical use case.  But as above, it *is* KVM's
responsibility to properly document any caveats with functionality.

> If we wanted to inject the suspend duration that happened after the migration
> started, but before it ended, I suppose we would need to add a way for the
> new VM instance to add to steal time, possibly through a new uAPI.

It's not just migration, and it's not just the case where the host is suspended
*after* vCPU state is saved.  Pending suspended time will also be lost if vCPU
state is saved after suspend+resume without entering the guest (because the
accounting is done late in KVM_RUN).

> It is also not clear to me why we would want that.

Which is fine as well.  If some crazy use case cares about accounting suspended
time across save+restore, then the onus is on that use case to implement and
justify appropriate support.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e159e44a6a1b61..01d44d527a7f88 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -897,6 +897,7 @@  struct kvm_vcpu_arch {
 		u8 preempted;
 		u64 msr_val;
 		u64 last_steal;
+		u64 last_suspend_ns;
 		struct gfn_to_hva_cache cache;
 	} st;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c8160baf383851..12439edc36f83a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3650,7 +3650,7 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 	struct kvm_steal_time __user *st;
 	struct kvm_memslots *slots;
 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
-	u64 steal;
+	u64 steal, suspend_ns;
 	u32 version;
 
 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
@@ -3677,6 +3677,7 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 			return;
 	}
 
+	suspend_ns = kvm_total_suspend_ns();
 	st = (struct kvm_steal_time __user *)ghc->hva;
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
@@ -3731,6 +3732,13 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	/*
+	 * Include the time that the host was suspended in steal time.
+	 * Note that the case of a suspend happening during a VM migration
+	 * might not be accounted.
+	 */
+	steal += suspend_ns - vcpu->arch.st.last_suspend_ns;
+	vcpu->arch.st.last_suspend_ns = suspend_ns;
 	unsafe_put_user(steal, &st->steal, out);
 
 	version += 1;
@@ -12299,6 +12307,7 @@  int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	if (r)
 		goto free_guest_fpu;
 
+	vcpu->arch.st.last_suspend_ns = kvm_total_suspend_ns();
 	kvm_xen_init_vcpu(vcpu);
 	vcpu_load(vcpu);
 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);