diff mbox series

KVM: x86: Include host suspended time in steal time.

Message ID 20240710074410.770409-1-suleiman@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86: Include host suspended time in steal time. | expand

Commit Message

Suleiman Souhlal July 10, 2024, 7:44 a.m. UTC
When the host resumes from a suspend, the guest thinks any task
that was running during the suspend ran for a long time, even though
the effective run time was much shorter, which can end up having
negative effects with scheduling. This can be particularly noticeable
if the guest task was RT, as it can end up getting throttled for a
long time.

To mitigate this issue, we include the time that the host was
suspended in steal time, which lets the guest subtract the
duration from the tasks' runtime.

Signed-off-by: Suleiman Souhlal <suleiman@google.com>
---
 arch/x86/kvm/x86.c       | 23 ++++++++++++++++++++++-
 include/linux/kvm_host.h |  4 ++++
 2 files changed, 26 insertions(+), 1 deletion(-)

Comments

Suleiman Souhlal July 29, 2024, 7:26 a.m. UTC | #1
Hello,

On Wed, Jul 10, 2024 at 4:44 PM Suleiman Souhlal <suleiman@google.com> wrote:
>
> When the host resumes from a suspend, the guest thinks any task
> that was running during the suspend ran for a long time, even though
> the effective run time was much shorter, which can end up having
> negative effects with scheduling. This can be particularly noticeable
> if the guest task was RT, as it can end up getting throttled for a
> long time.
>
> To mitigate this issue, we include the time that the host was
> suspended in steal time, which lets the guest can subtract the
> duration from the tasks' runtime.
>
> Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> ---
>  arch/x86/kvm/x86.c       | 23 ++++++++++++++++++++++-
>  include/linux/kvm_host.h |  4 ++++
>  2 files changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0763a0f72a067f..94bbdeef843863 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>         struct kvm_steal_time __user *st;
>         struct kvm_memslots *slots;
>         gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> -       u64 steal;
> +       u64 steal, suspend_duration;
>         u32 version;
>
>         if (kvm_xen_msr_enabled(vcpu->kvm)) {
> @@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>                         return;
>         }
>
> +       suspend_duration = 0;
> +       if (READ_ONCE(vcpu->suspended)) {
> +               suspend_duration = vcpu->kvm->last_suspend_duration;
> +               vcpu->suspended = 0;
> +       }
> +
>         st = (struct kvm_steal_time __user *)ghc->hva;
>         /*
>          * Doing a TLB flush here, on the guest's behalf, can avoid
> @@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>         unsafe_get_user(steal, &st->steal, out);
>         steal += current->sched_info.run_delay -
>                 vcpu->arch.st.last_steal;
> +       steal += suspend_duration;
>         vcpu->arch.st.last_steal = current->sched_info.run_delay;
>         unsafe_put_user(steal, &st->steal, out);
>
> @@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
>
>         mutex_lock(&kvm->lock);
>         kvm_for_each_vcpu(i, vcpu, kvm) {
> +               WRITE_ONCE(vcpu->suspended, 1);
>                 if (!vcpu->arch.pv_time.active)
>                         continue;
>
> @@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
>         }
>         mutex_unlock(&kvm->lock);
>
> +       kvm->suspended_time = ktime_get_boottime_ns();
> +
>         return ret ? NOTIFY_BAD : NOTIFY_DONE;
>  }
>
> +static int
> +kvm_arch_resume_notifier(struct kvm *kvm)
> +{
> +       kvm->last_suspend_duration = ktime_get_boottime_ns() -
> +           kvm->suspended_time;
> +       return NOTIFY_DONE;
> +}
> +
>  int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  {
>         switch (state) {
>         case PM_HIBERNATION_PREPARE:
>         case PM_SUSPEND_PREPARE:
>                 return kvm_arch_suspend_notifier(kvm);
> +       case PM_POST_HIBERNATION:
> +       case PM_POST_SUSPEND:
> +               return kvm_arch_resume_notifier(kvm);
>         }
>
>         return NOTIFY_DONE;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 692c01e41a18ef..2d37af9a348648 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -366,6 +366,8 @@ struct kvm_vcpu {
>         } async_pf;
>  #endif
>
> +       bool suspended;
> +
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>         /*
>          * Cpu relax intercept or pause loop exit optimization
> @@ -840,6 +842,8 @@ struct kvm {
>         struct xarray mem_attr_array;
>  #endif
>         char stats_id[KVM_STATS_NAME_SIZE];
> +       u64 last_suspend_duration;
> +       u64 suspended_time;
>  };
>
>  #define kvm_err(fmt, ...) \
> --
> 2.45.2.993.g49e7a77208-goog
>

Gentle ping.

Thanks,
-- Suleiman
Chao Gao July 30, 2024, 2:26 a.m. UTC | #2
Hi,

On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote:
>When the host resumes from a suspend, the guest thinks any task
>that was running during the suspend ran for a long time, even though
>the effective run time was much shorter, which can end up having
>negative effects with scheduling. This can be particularly noticeable
>if the guest task was RT, as it can end up getting throttled for a
>long time.
>
>To mitigate this issue, we include the time that the host was
>suspended in steal time, which lets the guest can subtract the
>duration from the tasks' runtime.
>
>Signed-off-by: Suleiman Souhlal <suleiman@google.com>
>---
> arch/x86/kvm/x86.c       | 23 ++++++++++++++++++++++-
> include/linux/kvm_host.h |  4 ++++
> 2 files changed, 26 insertions(+), 1 deletion(-)
>
>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index 0763a0f72a067f..94bbdeef843863 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> 	struct kvm_steal_time __user *st;
> 	struct kvm_memslots *slots;
> 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
>-	u64 steal;
>+	u64 steal, suspend_duration;
> 	u32 version;
> 
> 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
>@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> 			return;
> 	}
> 
>+	suspend_duration = 0;
>+	if (READ_ONCE(vcpu->suspended)) {
>+		suspend_duration = vcpu->kvm->last_suspend_duration;
>+		vcpu->suspended = 0;

Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() isn't used
for clearing vcpu->suspended?

>+	}
>+
> 	st = (struct kvm_steal_time __user *)ghc->hva;
> 	/*
> 	 * Doing a TLB flush here, on the guest's behalf, can avoid
>@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> 	unsafe_get_user(steal, &st->steal, out);
> 	steal += current->sched_info.run_delay -
> 		vcpu->arch.st.last_steal;
>+	steal += suspend_duration;
> 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
> 	unsafe_put_user(steal, &st->steal, out);
> 
>@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> 
> 	mutex_lock(&kvm->lock);
> 	kvm_for_each_vcpu(i, vcpu, kvm) {
>+		WRITE_ONCE(vcpu->suspended, 1);
> 		if (!vcpu->arch.pv_time.active)
> 			continue;
> 
>@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> 	}
> 	mutex_unlock(&kvm->lock);
> 
>+	kvm->suspended_time = ktime_get_boottime_ns();
>+
> 	return ret ? NOTIFY_BAD : NOTIFY_DONE;
> }
> 
>+static int
>+kvm_arch_resume_notifier(struct kvm *kvm)
>+{
>+	kvm->last_suspend_duration = ktime_get_boottime_ns() -
>+	    kvm->suspended_time;

Is it possible that a vCPU doesn't get any chance to run (i.e., update steal
time) between two suspends? In this case, only the second suspend would be
recorded.

Maybe we need an infrastructure in the PM subsystem to record accumulated
suspended time. When updating steal time, KVM can add the additional suspended
time since the last update into steal_time (as how KVM deals with
current->sched_info.run_delay). This way, the scenario I mentioned above won't
be a problem and KVM needn't calculate the suspend duration for each guest. And
this approach can potentially benefit RISC-V and ARM as well, since they have
the same logic as x86 regarding steal_time.

Additionally, it seems that if a guest migrates to another system after a suspend
and before updating steal time, the suspended time is lost during migration. I'm
not sure if this is a practical issue.
Suleiman Souhlal July 30, 2024, 9:45 a.m. UTC | #3
On Tue, Jul 30, 2024 at 10:26:30AM +0800, Chao Gao wrote:
> Hi,
> 
> On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote:
> >When the host resumes from a suspend, the guest thinks any task
> >that was running during the suspend ran for a long time, even though
> >the effective run time was much shorter, which can end up having
> >negative effects with scheduling. This can be particularly noticeable
> >if the guest task was RT, as it can end up getting throttled for a
> >long time.
> >
> >To mitigate this issue, we include the time that the host was
> >suspended in steal time, which lets the guest can subtract the
> >duration from the tasks' runtime.
> >
> >Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> >---
> > arch/x86/kvm/x86.c       | 23 ++++++++++++++++++++++-
> > include/linux/kvm_host.h |  4 ++++
> > 2 files changed, 26 insertions(+), 1 deletion(-)
> >
> >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >index 0763a0f72a067f..94bbdeef843863 100644
> >--- a/arch/x86/kvm/x86.c
> >+++ b/arch/x86/kvm/x86.c
> >@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > 	struct kvm_steal_time __user *st;
> > 	struct kvm_memslots *slots;
> > 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> >-	u64 steal;
> >+	u64 steal, suspend_duration;
> > 	u32 version;
> > 
> > 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
> >@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > 			return;
> > 	}
> > 
> >+	suspend_duration = 0;
> >+	if (READ_ONCE(vcpu->suspended)) {
> >+		suspend_duration = vcpu->kvm->last_suspend_duration;
> >+		vcpu->suspended = 0;
> 
> Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() isn't used
> for clearing vcpu->suspended?

Because this part of the code is essentially single-threaded, it didn't seem
like WRITE_ONCE() was needed. I can add it in an eventual future version of
the patch if it makes things less confusing (if this code still exists).

> 
> >+	}
> >+
> > 	st = (struct kvm_steal_time __user *)ghc->hva;
> > 	/*
> > 	 * Doing a TLB flush here, on the guest's behalf, can avoid
> >@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > 	unsafe_get_user(steal, &st->steal, out);
> > 	steal += current->sched_info.run_delay -
> > 		vcpu->arch.st.last_steal;
> >+	steal += suspend_duration;
> > 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
> > 	unsafe_put_user(steal, &st->steal, out);
> > 
> >@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> > 
> > 	mutex_lock(&kvm->lock);
> > 	kvm_for_each_vcpu(i, vcpu, kvm) {
> >+		WRITE_ONCE(vcpu->suspended, 1);
> > 		if (!vcpu->arch.pv_time.active)
> > 			continue;
> > 
> >@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> > 	}
> > 	mutex_unlock(&kvm->lock);
> > 
> >+	kvm->suspended_time = ktime_get_boottime_ns();
> >+
> > 	return ret ? NOTIFY_BAD : NOTIFY_DONE;
> > }
> > 
> >+static int
> >+kvm_arch_resume_notifier(struct kvm *kvm)
> >+{
> >+	kvm->last_suspend_duration = ktime_get_boottime_ns() -
> >+	    kvm->suspended_time;
> 
> Is it possible that a vCPU doesn't get any chance to run (i.e., update steal
> time) between two suspends? In this case, only the second suspend would be
> recorded.

Good point. I'll address this.

> 
> Maybe we need an infrastructure in the PM subsystem to record accumulated
> suspended time. When updating steal time, KVM can add the additional suspended
> time since the last update into steal_time (as how KVM deals with
> current->sched_info.run_deley). This way, the scenario I mentioned above won't
> be a problem and KVM needn't calculate the suspend duration for each guest. And
> this approach can potentially benefit RISC-V and ARM as well, since they have
> the same logic as x86 regarding steal_time.

Thanks for the suggestion.
I'm a bit wary of making a whole PM subsystem addition for such a counter, but
maybe I can make a architecture-independent KVM change for it, with a PM
notifier in kvm_main.c.

> 
> Additionally, it seems that if a guest migrates to another system after a suspend
> and before updating steal time, the suspended time is lost during migration. I'm
> not sure if this is a practical issue.

The systems where the host suspends don't usually do VM migrations. Or at least
the ones where we're encountering the problem this patch is trying to address
don't (laptops).
But even if they did, it doesn't seem that likely that the migration would
happen over a host suspend.
If it's ok with you, I'll put this issue aside for the time being.

Thanks for the comments.
-- Suleiman
Chao Gao July 31, 2024, 1:03 a.m. UTC | #4
>> >+static int
>> >+kvm_arch_resume_notifier(struct kvm *kvm)
>> >+{
>> >+	kvm->last_suspend_duration = ktime_get_boottime_ns() -
>> >+	    kvm->suspended_time;
>> 
>> Is it possible that a vCPU doesn't get any chance to run (i.e., update steal
>> time) between two suspends? In this case, only the second suspend would be
>> recorded.
>
>Good point. I'll address this.
>
>> 
>> Maybe we need an infrastructure in the PM subsystem to record accumulated
>> suspended time. When updating steal time, KVM can add the additional suspended
>> time since the last update into steal_time (as how KVM deals with
>> current->sched_info.run_deley). This way, the scenario I mentioned above won't
>> be a problem and KVM needn't calculate the suspend duration for each guest. And
>> this approach can potentially benefit RISC-V and ARM as well, since they have
>> the same logic as x86 regarding steal_time.
>
>Thanks for the suggestion.
>I'm a bit wary of making a whole PM subsystem addition for such a counter, but
>maybe I can make a architecture-independent KVM change for it, with a PM
>notifier in kvm_main.c.

Sounds good.

>
>> 
>> Additionally, it seems that if a guest migrates to another system after a suspend
>> and before updating steal time, the suspended time is lost during migration. I'm
>> not sure if this is a practical issue.
>
>The systems where the host suspends don't usually do VM migrations. Or at least
>the ones where we're encountering the problem this patch is trying to address
>don't (laptops).
>But even if they did, it doesn't seem that likely that the migration would
>happen over a host suspend.
>If it's ok with you, I'll put this issue aside for the time being.

I am fine with putting this issue aside.
Sean Christopherson Aug. 13, 2024, 5:06 p.m. UTC | #5
On Tue, Jul 30, 2024, Suleiman Souhlal wrote:
> On Tue, Jul 30, 2024 at 10:26:30AM +0800, Chao Gao wrote:
> > Hi,
> > 
> > On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote:
> > >When the host resumes from a suspend, the guest thinks any task
> > >that was running during the suspend ran for a long time, even though
> > >the effective run time was much shorter, which can end up having
> > >negative effects with scheduling. This can be particularly noticeable
> > >if the guest task was RT, as it can end up getting throttled for a
> > >long time.
> > >
> > >To mitigate this issue, we include the time that the host was
> > >suspended in steal time, which lets the guest can subtract the
> > >duration from the tasks' runtime.
> > >
> > >Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> > >---
> > > arch/x86/kvm/x86.c       | 23 ++++++++++++++++++++++-
> > > include/linux/kvm_host.h |  4 ++++
> > > 2 files changed, 26 insertions(+), 1 deletion(-)
> > >
> > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > >index 0763a0f72a067f..94bbdeef843863 100644
> > >--- a/arch/x86/kvm/x86.c
> > >+++ b/arch/x86/kvm/x86.c
> > >@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > > 	struct kvm_steal_time __user *st;
> > > 	struct kvm_memslots *slots;
> > > 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> > >-	u64 steal;
> > >+	u64 steal, suspend_duration;
> > > 	u32 version;
> > > 
> > > 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
> > >@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > > 			return;
> > > 	}
> > > 
> > >+	suspend_duration = 0;
> > >+	if (READ_ONCE(vcpu->suspended)) {
> > >+		suspend_duration = vcpu->kvm->last_suspend_duration;
> > >+		vcpu->suspended = 0;
> > 
> > Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() isn't used
> > for clearing vcpu->suspended?
> 
> Because this part of the code is essentially single-threaded, it didn't seem
> like WRITE_ONCE() was needed. I can add it in an eventual future version of
> the patch if it makes things less confusing (if this code still exists).

{READ,WRITE}_ONCE() don't provide ordering, they only ensure that the compiler
emits a load/store.  But (a) forcing the compiler to emit a load/store is only
necessary when it's possible for the compiler to know/prove that the load/store
won't affect functionality, and (b) this code doesn't have the correct
ordering even if there were the appropriate smp_wmb() and smp_rmb() annotations.

The writer needs to ensure the write to last_suspend_duration is visible before
vcpu->suspended is set, and the reader needs to ensure it reads last_suspend_duration
after vcpu->suspended.

That said, it's likely a moot point, because I assume the PM notifier runs while
tasks are frozen, i.e. its writes are guaranteed to be ordered before
record_steal_time().

And _that_ said, I don't see any reason for two fields, nor do I see any reason
to track the suspended duration in the VM instead of the vCPU.  Similar to what
Chao pointed out, per-VM tracking falls apart if only some vCPUs consume the
suspend duration.  I.e. just accumulate the suspend duration directly in the
vCPU.

> > >+	}
> > >+
> > > 	st = (struct kvm_steal_time __user *)ghc->hva;
> > > 	/*
> > > 	 * Doing a TLB flush here, on the guest's behalf, can avoid
> > >@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > > 	unsafe_get_user(steal, &st->steal, out);
> > > 	steal += current->sched_info.run_delay -
> > > 		vcpu->arch.st.last_steal;
> > >+	steal += suspend_duration;
> > > 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
> > > 	unsafe_put_user(steal, &st->steal, out);
> > > 
> > >@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> > > 
> > > 	mutex_lock(&kvm->lock);
> > > 	kvm_for_each_vcpu(i, vcpu, kvm) {
> > >+		WRITE_ONCE(vcpu->suspended, 1);
> > > 		if (!vcpu->arch.pv_time.active)
> > > 			continue;
> > > 
> > >@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> > > 	}
> > > 	mutex_unlock(&kvm->lock);
> > > 
> > >+	kvm->suspended_time = ktime_get_boottime_ns();
> > >+
> > > 	return ret ? NOTIFY_BAD : NOTIFY_DONE;
> > > }
> > > 
> > >+static int
> > >+kvm_arch_resume_notifier(struct kvm *kvm)

Do not wrap before the function name.  Linus has a nice explanation/rant on this[*].

[*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@mail.gmail.com

> > >+{
> > >+	kvm->last_suspend_duration = ktime_get_boottime_ns() -
> > >+	    kvm->suspended_time;
> > 
> > Is it possible that a vCPU doesn't get any chance to run (i.e., update steal
> > time) between two suspends? In this case, only the second suspend would be
> > recorded.
> 
> Good point. I'll address this.
> 
> > 
> > Maybe we need an infrastructure in the PM subsystem to record accumulated
> > suspended time. When updating steal time, KVM can add the additional suspended
> > time since the last update into steal_time (as how KVM deals with
> > current->sched_info.run_deley). This way, the scenario I mentioned above won't
> > be a problem and KVM needn't calculate the suspend duration for each guest. And
> > this approach can potentially benefit RISC-V and ARM as well, since they have
> > the same logic as x86 regarding steal_time.
> 
> Thanks for the suggestion.
> I'm a bit wary of making a whole PM subsystem addition for such a counter, but
> maybe I can make a architecture-independent KVM change for it, with a PM
> notifier in kvm_main.c.

Yes.  Either that, or the fields need to be in kvm_vcpu_arch, not kvm_vcpu.

> > Additionally, it seems that if a guest migrates to another system after a
> > suspend and before updating steal time, the suspended time is lost during
> > migration. I'm not sure if this is a practical issue.
> 
> The systems where the host suspends don't usually do VM migrations. Or at
> least the ones where we're encountering the problem this patch is trying to
> address don't (laptops).
>
> But even if they did, it doesn't seem that likely that the migration would
> happen over a host suspend.

I think we want to account for this straightaway, or at least have defined and
documented behavior, else we risk rehashing the issues with marking a vCPU as
preempted when it's loaded, but not running.  Which causes problems for live
migration as it results in KVM marking the steal-time page as dirty after vCPUs
have been paused.

[*] https://lkml.kernel.org/r/20240503181734.1467938-4-dmatlack%40google.com
Suleiman Souhlal Aug. 14, 2024, 4:50 a.m. UTC | #6
On Wed, Aug 14, 2024 at 2:06 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jul 30, 2024, Suleiman Souhlal wrote:
> > On Tue, Jul 30, 2024 at 10:26:30AM +0800, Chao Gao wrote:
> > > Hi,
> > >
> > > On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote:
> > > >When the host resumes from a suspend, the guest thinks any task
> > > >that was running during the suspend ran for a long time, even though
> > > >the effective run time was much shorter, which can end up having
> > > >negative effects with scheduling. This can be particularly noticeable
> > > >if the guest task was RT, as it can end up getting throttled for a
> > > >long time.
> > > >
> > > >To mitigate this issue, we include the time that the host was
> > > >suspended in steal time, which lets the guest can subtract the
> > > >duration from the tasks' runtime.
> > > >
> > > >Signed-off-by: Suleiman Souhlal <suleiman@google.com>
> > > >---
> > > > arch/x86/kvm/x86.c       | 23 ++++++++++++++++++++++-
> > > > include/linux/kvm_host.h |  4 ++++
> > > > 2 files changed, 26 insertions(+), 1 deletion(-)
> > > >
> > > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > >index 0763a0f72a067f..94bbdeef843863 100644
> > > >--- a/arch/x86/kvm/x86.c
> > > >+++ b/arch/x86/kvm/x86.c
> > > >@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > > >   struct kvm_steal_time __user *st;
> > > >   struct kvm_memslots *slots;
> > > >   gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> > > >-  u64 steal;
> > > >+  u64 steal, suspend_duration;
> > > >   u32 version;
> > > >
> > > >   if (kvm_xen_msr_enabled(vcpu->kvm)) {
> > > >@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > > >                   return;
> > > >   }
> > > >
> > > >+  suspend_duration = 0;
> > > >+  if (READ_ONCE(vcpu->suspended)) {
> > > >+          suspend_duration = vcpu->kvm->last_suspend_duration;
> > > >+          vcpu->suspended = 0;
> > >
> > > Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() isn't used
> > > for clearing vcpu->suspended?
> >
> > Because this part of the code is essentially single-threaded, it didn't seem
> > like WRITE_ONCE() was needed. I can add it in an eventual future version of
> > the patch if it makes things less confusing (if this code still exists).
>
> {READ,WRITE}_ONCE() don't provide ordering, they only ensure that the compiler
> emits a load/store.  But (a) forcing the compiler to emit a load/store is only
> necessary when it's possible for the compiler to know/prove that the load/store
> won't affect functionalty, and (b) this code doesn't have have the correct
> ordering even if there were the appropriate smp_wmb() and smp_rmb() annotations.
>
> The writer needs to ensure the write to last_suspend_duration is visible before
> vcpu->suspended is set, and the reader needs to ensure it reads last_suspend_duration
> after vcpu->suspended.
>
> That said, it's likely a moot point, because I assume the PM notifier runs while
> tasks are frozen, i.e. its writes are guaranteed to be ordered before
> record_steal_time().
>
> And _that_ said, I don't see any reason for two fields, nor do I see any reason
> to track the suspended duration in the VM instead of the vCPU.  Similar to what
> Chao pointed out, per-VM tracking falls apart if only some vCPUs consume the
> suspend duration.  I.e. just accumulate the suspend duration directly in the
> vCPU.

v2 will do this.

>
> > > >+  }
> > > >+
> > > >   st = (struct kvm_steal_time __user *)ghc->hva;
> > > >   /*
> > > >    * Doing a TLB flush here, on the guest's behalf, can avoid
> > > >@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> > > >   unsafe_get_user(steal, &st->steal, out);
> > > >   steal += current->sched_info.run_delay -
> > > >           vcpu->arch.st.last_steal;
> > > >+  steal += suspend_duration;
> > > >   vcpu->arch.st.last_steal = current->sched_info.run_delay;
> > > >   unsafe_put_user(steal, &st->steal, out);
> > > >
> > > >@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> > > >
> > > >   mutex_lock(&kvm->lock);
> > > >   kvm_for_each_vcpu(i, vcpu, kvm) {
> > > >+          WRITE_ONCE(vcpu->suspended, 1);
> > > >           if (!vcpu->arch.pv_time.active)
> > > >                   continue;
> > > >
> > > >@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> > > >   }
> > > >   mutex_unlock(&kvm->lock);
> > > >
> > > >+  kvm->suspended_time = ktime_get_boottime_ns();
> > > >+
> > > >   return ret ? NOTIFY_BAD : NOTIFY_DONE;
> > > > }
> > > >
> > > >+static int
> > > >+kvm_arch_resume_notifier(struct kvm *kvm)
>
> Do not wrap before the function name.  Linus has a nice explanation/rant on this[*].
>
> [*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@mail.gmail.com
>
> > > >+{
> > > >+  kvm->last_suspend_duration = ktime_get_boottime_ns() -
> > > >+      kvm->suspended_time;
> > >
> > > Is it possible that a vCPU doesn't get any chance to run (i.e., update steal
> > > time) between two suspends? In this case, only the second suspend would be
> > > recorded.
> >
> > Good point. I'll address this.
> >
> > >
> > > Maybe we need an infrastructure in the PM subsystem to record accumulated
> > > suspended time. When updating steal time, KVM can add the additional suspended
> > > time since the last update into steal_time (as how KVM deals with
> > > current->sched_info.run_deley). This way, the scenario I mentioned above won't
> > > be a problem and KVM needn't calculate the suspend duration for each guest. And
> > > this approach can potentially benefit RISC-V and ARM as well, since they have
> > > the same logic as x86 regarding steal_time.
> >
> > Thanks for the suggestion.
> > I'm a bit wary of making a whole PM subsystem addition for such a counter, but
> > maybe I can make a architecture-independent KVM change for it, with a PM
> > notifier in kvm_main.c.
>
> Yes.  Either that, or the fields need to be in kvm_vcpu_arch, not kvm_vcpu.
>
> > > Additionally, it seems that if a guest migrates to another system after a
> > > suspend and before updating steal time, the suspended time is lost during
> > > migration. I'm not sure if this is a practical issue.
> >
> > The systems where the host suspends don't usually do VM migrations. Or at
> > least the ones where we're encountering the problem this patch is trying to
> > address don't (laptops).
> >
> > But even if they did, it doesn't seem that likely that the migration would
> > happen over a host suspend.
>
> I think we want to account for this straightaway, or at least have defined and
> documented behavior, else we risk rehashing the issues with marking a vCPU as
> preempted when it's loaded, but not running.  Which causes problems for live
> migration as it results in KVM marking the steal-time page as dirty after vCPUs
> have been paused.
>
> [*] https://lkml.kernel.org/r/20240503181734.1467938-4-dmatlack%40google.com

Can you explain how the steal-time page could get marked as dirty after VCPUs
have been paused?
From what I can tell, record_steal_time() gets called from vcpu_enter_guest(),
which shouldn't happen when the VCPU has been paused, but I have to admit I
don't really know anything about how live migration works.
The series you linked is addressing an issue when the steal-time page
gets written
to outside of record_steal_time(), but we aren't doing this for this
proposed patch.

With the proposed approach, the steal time page would get copied to the new host
and everything would keep working correctly, with the exception of a
possible host
suspend happening between when the migration started and when it finishes, not
being reflected post-migration.
That seems like a reasonable compromise.

Please let me know if you want more.
I am planning on sending v2 within the next few days.

Thanks,
-- Suleiman
Sean Christopherson Aug. 14, 2024, 3:35 p.m. UTC | #7
On Wed, Aug 14, 2024, Suleiman Souhlal wrote:
> On Wed, Aug 14, 2024 at 2:06 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > Additionally, it seems that if a guest migrates to another system after a
> > > > suspend and before updating steal time, the suspended time is lost during
> > > > migration. I'm not sure if this is a practical issue.
> > >
> > > The systems where the host suspends don't usually do VM migrations. Or at
> > > least the ones where we're encountering the problem this patch is trying to
> > > address don't (laptops).
> > >
> > > But even if they did, it doesn't seem that likely that the migration would
> > > happen over a host suspend.
> >
> > I think we want to account for this straightaway, or at least have defined and
> > documented behavior, else we risk rehashing the issues with marking a vCPU as
> > preempted when it's loaded, but not running.  Which causes problems for live
> > migration as it results in KVM marking the steal-time page as dirty after vCPUs
> > have been paused.
> >
> > [*] https://lkml.kernel.org/r/20240503181734.1467938-4-dmatlack%40google.com
> 
> Can you explain how the steal-time page could get marked as dirty after VCPUs
> have been paused? From what I can tell, record_steal_time() gets called from
> vcpu_enter_guest(), which shouldn't happen when the VCPU has been paused, but
> I have to admit I don't really know anything about how live migration works.

It's not record_steal_time(), it's kvm_steal_time_set_preempted().  The flag
KVM uses to tell the guest that the vCPU has been scheduled out, KVM_VCPU_PREEMPTED,
resides in the kvm_steal_time structure, i.e. in the steal-time page.

Userspace "pauses" vCPUs when it enters blackout to complete live migration.  After
pausing vCPUs, the VMM invokes various KVM_GET_* ioctls to retrieve vCPU state
so that it can be transfered to the destination.  Without the above series, KVM
marks vCPUs as preempted when the associated task is scheduled out and the vCPU
is "loaded", even if the vCPU is not actively running.  This results in KVM writing
to kvm_steal_time.preempted and dirtying the page, after userspace thinks it
should be impossible for KVM to dirty guest memory (because vCPUs are no longer
being run).

> The series you linked is addressing an issue when the steal-time page gets
> written to outside of record_steal_time(), but we aren't doing this for this
> proposed patch.

I know.  What I am saying is that I don't want to punt on the issue Chao raised,
because _if_ we want to properly account suspend time when a vCPU is migrated
(or saved/restored for any reason) without doing KVM_RUN after suspend, then that
either requires updating the steal-time information outside of KVM_RUN, or it
requires new uAPI to explicitly migrate the unaccounted suspend time.

Given that new uAPI is generally avoided when possible, that makes updating
steal-time outside of KVM_RUN the default choice (which isn't necessarily the
best choice), which in turn means KVM now has to worry about the above scenario
of writing to guest memory after vCPUs have been paused by userspace.

> With the proposed approach, the steal time page would get copied to the new
> host and everything would keep working correctly, with the exception of a
> possible host suspend happening between when the migration started and when
> it finishes, not being reflected post-migration.  That seems like a
> reasonable compromise.

Maybe, but I'm not keen on sweeping this under the rug.  Ignoring issues because
they'll "never" happen has bitten KVM more than once.

At the absolute bare minimum, the flaw needs to be documented, with a suggested
workaround provided (do KVM_RUN on all vCPUs before migrating after suspend), e.g.
so that userspace can work around the issue in the unlikely scenario userspace
does suspend+resume, saves/restores a VM, *and* cares about steal-time.

Even better would be if we can figure out a way to effectively require KVM_RUN
after suspend+resume, but I can't think of a way to do that without breaking
userspace or adding new uAPI, and adding new uAPI for this feels like overkill.
Suleiman Souhlal Aug. 15, 2024, 4:33 a.m. UTC | #8
On Thu, Aug 15, 2024 at 12:35 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Aug 14, 2024, Suleiman Souhlal wrote:
>
> > With the proposed approach, the steal time page would get copied to the new
> > host and everything would keep working correctly, with the exception of a
> > possible host suspend happening between when the migration started and when
> > it finishes, not being reflected post-migration.  That seems like a
> > reasonable compromise.
>
> Maybe, but I'm not keen on sweeping this under the rug.  Ignoring issues because
> they'll "never" happen has bitten KVM more than once.
>
> At the absolute bare minimum, the flaw needs to be documented, with a suggested
> workaround provided (do KVM on all vCPUs before migrating after suspend), e.g.
> so that userspace can workaround the issue in the unlikely scenario userspace
> does suspend+resume, saves/restores a VM, *and* cares about steal-time.

I can write a comment in record_steal_time() that describes the
scenario, mention it in
the commit message and add something to the steal time part of
Documentation/virt/kvm/x86/msr.rst.

-- Suleiman
diff mbox series

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0763a0f72a067f..94bbdeef843863 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3669,7 +3669,7 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 	struct kvm_steal_time __user *st;
 	struct kvm_memslots *slots;
 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
-	u64 steal;
+	u64 steal, suspend_duration;
 	u32 version;
 
 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
@@ -3696,6 +3696,12 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 			return;
 	}
 
+	suspend_duration = 0;
+	if (READ_ONCE(vcpu->suspended)) {
+		suspend_duration = vcpu->kvm->last_suspend_duration;
+		vcpu->suspended = 0;
+	}
+
 	st = (struct kvm_steal_time __user *)ghc->hva;
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
@@ -3749,6 +3755,7 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 	unsafe_get_user(steal, &st->steal, out);
 	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
+	steal += suspend_duration;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
 	unsafe_put_user(steal, &st->steal, out);
 
@@ -6920,6 +6927,7 @@  static int kvm_arch_suspend_notifier(struct kvm *kvm)
 
 	mutex_lock(&kvm->lock);
 	kvm_for_each_vcpu(i, vcpu, kvm) {
+		WRITE_ONCE(vcpu->suspended, 1);
 		if (!vcpu->arch.pv_time.active)
 			continue;
 
@@ -6932,15 +6940,28 @@  static int kvm_arch_suspend_notifier(struct kvm *kvm)
 	}
 	mutex_unlock(&kvm->lock);
 
+	kvm->suspended_time = ktime_get_boottime_ns();
+
 	return ret ? NOTIFY_BAD : NOTIFY_DONE;
 }
 
+static int
+kvm_arch_resume_notifier(struct kvm *kvm)
+{
+	kvm->last_suspend_duration = ktime_get_boottime_ns() -
+	    kvm->suspended_time;
+	return NOTIFY_DONE;
+}
+
 int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
 {
 	switch (state) {
 	case PM_HIBERNATION_PREPARE:
 	case PM_SUSPEND_PREPARE:
 		return kvm_arch_suspend_notifier(kvm);
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+		return kvm_arch_resume_notifier(kvm);
 	}
 
 	return NOTIFY_DONE;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 692c01e41a18ef..2d37af9a348648 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -366,6 +366,8 @@  struct kvm_vcpu {
 	} async_pf;
 #endif
 
+	bool suspended;
+
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 	/*
 	 * Cpu relax intercept or pause loop exit optimization
@@ -840,6 +842,8 @@  struct kvm {
 	struct xarray mem_attr_array;
 #endif
 	char stats_id[KVM_STATS_NAME_SIZE];
+	u64 last_suspend_duration;
+	u64 suspended_time;
 };
 
 #define kvm_err(fmt, ...) \