
[v2,3/7] KVM-HV: KVM Steal time implementation

Message ID 1308262856-5779-4-git-send-email-glommer@redhat.com (mailing list archive)
State New, archived

Commit Message

Glauber Costa June 16, 2011, 10:20 p.m. UTC
To implement steal time, we need the hypervisor to pass the guest information
about how much time was spent running other processes outside the VM.
This is per-vcpu, and using the kvmclock structure for that would be an abuse
we decided against.

In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
holds the address of the memory area containing information about steal time.

This patch contains the hypervisor part for it. I am keeping it separate from
the headers to facilitate backports for people who want to backport the kernel
part but not the hypervisor, or the other way around.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: Rik van Riel <riel@redhat.com>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Avi Kivity <avi@redhat.com>
CC: Anthony Liguori <aliguori@us.ibm.com>
CC: Eric B Munson <emunson@mgebm.net>
---
 arch/x86/include/asm/kvm_host.h |    8 +++++
 arch/x86/include/asm/kvm_para.h |    4 ++
 arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 69 insertions(+), 3 deletions(-)

Comments

Eric B Munson June 17, 2011, 12:48 a.m. UTC | #1
On Thu, 16 Jun 2011, Glauber Costa wrote:

> To implement steal time, we need the hypervisor to pass the guest information
> about how much time was spent running other processes outside the VM.
> This is per-vcpu, and using the kvmclock structure for that would be an abuse
> we decided against.
> 
> In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
> holds the address of the memory area containing information about steal time.
> 
> This patch contains the hypervisor part for it. I am keeping it separate from
> the headers to facilitate backports for people who want to backport the kernel
> part but not the hypervisor, or the other way around.
> 
> Signed-off-by: Glauber Costa <glommer@redhat.com>
> CC: Rik van Riel <riel@redhat.com>
> CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Avi Kivity <avi@redhat.com>
> CC: Anthony Liguori <aliguori@us.ibm.com>
> CC: Eric B Munson <emunson@mgebm.net>

Tested-by: Eric B Munson <emunson@mgebm.net>
Avi Kivity June 19, 2011, 9:57 a.m. UTC | #2
On 06/17/2011 01:20 AM, Glauber Costa wrote:
> To implement steal time, we need the hypervisor to pass the guest information
> about how much time was spent running other processes outside the VM.
> This is per-vcpu, and using the kvmclock structure for that would be an abuse
> we decided against.
>
> In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
> holds the address of the memory area containing information about steal time.
>
> This patch contains the hypervisor part for it. I am keeping it separate from
> the headers to facilitate backports for people who want to backport the kernel
> part but not the hypervisor, or the other way around.
>
>
>
> +#define KVM_STEAL_ALIGNMENT_BITS 5
> +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
> +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)

Clumsy, but okay.
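
Decoded, bit 0 of the MSR value is the enable flag, bits 1-5 are reserved
and must be zero, and bits 6-63 carry the guest physical address, so the
steal time area has to be 64-byte aligned. For illustration only, guest-side
enablement could look roughly like this (a sketch, not part of this patch;
it assumes KVM_MSR_ENABLED is bit 0 and a per-cpu area declared as shown):

	/* Sketch: register a 64-byte aligned per-cpu steal time area.
	 * The low 6 bits of its physical address are zero, leaving
	 * room for the enable flag and the reserved bits. */
	static DEFINE_PER_CPU(struct kvm_steal_time, steal_area) __aligned(64);

	u64 val = __pa(&per_cpu(steal_area, cpu)) | KVM_MSR_ENABLED;
	wrmsrl(MSR_KVM_STEAL_TIME, val);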

> +static void record_steal_time(struct kvm_vcpu *vcpu)
> +{
> +	u64 delta;
> +
> +	if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {

0 is a valid value for stime.

> +
> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> +
> +			vcpu->arch.st.stime = 0;
> +			return;
> +		}
> +
> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
> +
> +		vcpu->arch.st.steal.steal += delta;
> +		vcpu->arch.st.steal.version += 2;
> +
> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> +
> +			vcpu->arch.st.stime = 0;
> +			return;
> +		}
> +	}
> +
> +}
> +
>
> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>   			kvm_migrate_timers(vcpu);
>   		vcpu->cpu = cpu;
>   	}
> +
> +	record_steal_time(vcpu);
>   }

This records time spent in userspace in the vcpu thread as steal time.  
Is this what we want?  Or just time preempted away?
Glauber Costa June 20, 2011, 2:53 a.m. UTC | #3
On 06/19/2011 06:57 AM, Avi Kivity wrote:
> On 06/17/2011 01:20 AM, Glauber Costa wrote:
>> To implement steal time, we need the hypervisor to pass the guest
>> information
>> about how much time was spent running other processes outside the VM.
>> This is per-vcpu, and using the kvmclock structure for that would be an abuse
>> we decided against.
>>
>> In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
>> holds the address of the memory area containing information about steal time.
>>
>> This patch contains the hypervisor part for it. I am keeping it
>> separate from
>> the headers to facilitate backports for people who want to backport
>> the kernel
>> part but not the hypervisor, or the other way around.
>>
>>
>>
>> +#define KVM_STEAL_ALIGNMENT_BITS 5
>> +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
>> +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
>
> Clumsy, but okay.
>
>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>> +{
>> + u64 delta;
>> +
>> + if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
>
> 0 is a valid value for stime.

how exactly? stime is a guest physical address...


>> +
>> + if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
>> + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>> +
>> + vcpu->arch.st.stime = 0;
>> + return;
>> + }
>> +
>> + delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
>> +
>> + vcpu->arch.st.steal.steal += delta;
>> + vcpu->arch.st.steal.version += 2;
>> +
>> + if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
>> + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>> +
>> + vcpu->arch.st.stime = 0;
>> + return;
>> + }
>> + }
>> +
>> +}
>> +
>>
>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu,
>> int cpu)
>> kvm_migrate_timers(vcpu);
>> vcpu->cpu = cpu;
>> }
>> +
>> + record_steal_time(vcpu);
>> }
>
> This records time spent in userspace in the vcpu thread as steal time.
> Is this what we want? Or just time preempted away?

There are arguments either way.

Right now, the way it is, it does account our iothread as steal time,
which is not 100% accurate if we think of steal time as "whatever takes
time away from our VM". I tend to think of it as "whatever takes time away
from this CPU", which includes other cpus in the same VM. Thinking of it
this way, in a 1-1 phys-to-virt cpu mapping, if the iothread is taking
80% cpu for whatever reason, we have 80% steal time on the cpu that is
sharing the physical cpu with the iothread.

Maybe we could account that as iotime?
Questions like that are one of the reasons behind me leaving extra
fields in the steal time structure. We could do more fine-grained
accounting and differentiate between the multiple entities that can do
work (of various kinds) on our behalf.

Avi Kivity June 20, 2011, 6:02 a.m. UTC | #4
On 06/20/2011 05:53 AM, Glauber Costa wrote:
>
>>
>>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>>> +{
>>> + u64 delta;
>>> +
>>> + if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
>>
>> 0 is a valid value for stime.
>
>
> how exactly? stime is a guest physical address...

0 is a valid physical address.
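
A concrete alternative, sketched here only under the assumption that
msr_val keeps the raw value the guest wrote, is to gate on the enable bit
rather than on a non-zero address:

	/* Sketch: use the MSR enable bit, not stime != 0, to decide
	 * whether a steal time area is registered, since 0 is a legal
	 * guest physical address. */
	if ((vcpu->arch.st.msr_val & KVM_MSR_ENABLED) &&
	    vcpu->arch.st.this_time_out) {
		/* read, update and write back the area as before */
	}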

>>>
>>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu,
>>> int cpu)
>>> kvm_migrate_timers(vcpu);
>>> vcpu->cpu = cpu;
>>> }
>>> +
>>> + record_steal_time(vcpu);
>>> }
>>
>> This records time spent in userspace in the vcpu thread as steal time.
>> Is this what we want? Or just time preempted away?
>
> There are arguments either way.
>
> Right now, the way it is, it does account our iothread as steal time,
> which is not 100% accurate if we think of steal time as "whatever takes
> time away from our VM". I tend to think of it as "whatever takes time
> away from this CPU", which includes other cpus in the same VM. Thinking
> of it this way, in a 1-1 phys-to-virt cpu mapping, if the iothread
> is taking 80% cpu for whatever reason, we have 80% steal time on the
> cpu that is sharing the physical cpu with the iothread.

I'm not talking about the iothread, rather the vcpu thread while running 
in userspace.

>
> Maybe we could account that as iotime?
> Questions like that are one of the reasons behind me leaving extra
> fields in the steal time structure. We could do more fine-grained
> accounting and differentiate between the multiple entities that can do
> work (of various kinds) on our behalf.
>

What do other architectures do (xen, s390)?
Marcelo Tosatti June 20, 2011, 8:56 p.m. UTC | #5
On Sun, Jun 19, 2011 at 12:57:53PM +0300, Avi Kivity wrote:
> On 06/17/2011 01:20 AM, Glauber Costa wrote:
> >To implement steal time, we need the hypervisor to pass the guest information
> >about how much time was spent running other processes outside the VM.
> >This is per-vcpu, and using the kvmclock structure for that would be an abuse
> >we decided against.
> >
> >In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
> >holds the address of the memory area containing information about steal time.
> >
> >This patch contains the hypervisor part for it. I am keeping it separate from
> >the headers to facilitate backports for people who want to backport the kernel
> >part but not the hypervisor, or the other way around.
> >
> >
> >
> >+#define KVM_STEAL_ALIGNMENT_BITS 5
> >+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
> >+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
> 
> Clumsy, but okay.
> 
> >+static void record_steal_time(struct kvm_vcpu *vcpu)
> >+{
> >+	u64 delta;
> >+
> >+	if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
> 
> 0 is a valid value for stime.
> 
> >+
> >+		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
> >+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >+
> >+			vcpu->arch.st.stime = 0;
> >+			return;
> >+		}
> >+
> >+		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
> >+
> >+		vcpu->arch.st.steal.steal += delta;
> >+		vcpu->arch.st.steal.version += 2;
> >+
> >+		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> >+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >+
> >+			vcpu->arch.st.stime = 0;
> >+			return;
> >+		}
> >+	}
> >+
> >+}
> >+
> >
> >@@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >  			kvm_migrate_timers(vcpu);
> >  		vcpu->cpu = cpu;
> >  	}
> >+
> >+	record_steal_time(vcpu);
> >  }
> 
> This records time spent in userspace in the vcpu thread as steal
> time.  Is this what we want?  Or just time preempted away?

It also accounts halt time (kvm_vcpu_block) as steal time. Glauber, you
could instead use the "runnable-state-but-waiting-in-runqueue" field of
SCHEDSTATS, I forgot the exact name.

Glauber Costa June 28, 2011, 12:30 p.m. UTC | #6
On 06/20/2011 05:56 PM, Marcelo Tosatti wrote:
> On Sun, Jun 19, 2011 at 12:57:53PM +0300, Avi Kivity wrote:
>> On 06/17/2011 01:20 AM, Glauber Costa wrote:
>>> To implement steal time, we need the hypervisor to pass the guest information
>>> about how much time was spent running other processes outside the VM.
>>> This is per-vcpu, and using the kvmclock structure for that would be an abuse
>>> we decided against.
>>>
>>> In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
>>> holds the address of the memory area containing information about steal time.
>>>
>>> This patch contains the hypervisor part for it. I am keeping it separate from
>>> the headers to facilitate backports for people who want to backport the kernel
>>> part but not the hypervisor, or the other way around.
>>>
>>>
>>>
>>> +#define KVM_STEAL_ALIGNMENT_BITS 5
>>> +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
>>> +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
>>
>> Clumsy, but okay.
>>
>>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>>> +{
>>> +	u64 delta;
>>> +
>>> +	if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
>>
>> 0 is a valid value for stime.
>>
>>> +
>>> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>> +
>>> +			vcpu->arch.st.stime = 0;
>>> +			return;
>>> +		}
>>> +
>>> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
>>> +
>>> +		vcpu->arch.st.steal.steal += delta;
>>> +		vcpu->arch.st.steal.version += 2;
>>> +
>>> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>> +
>>> +			vcpu->arch.st.stime = 0;
>>> +			return;
>>> +		}
>>> +	}
>>> +
>>> +}
>>> +
>>>
>>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>   			kvm_migrate_timers(vcpu);
>>>   		vcpu->cpu = cpu;
>>>   	}
>>> +
>>> +	record_steal_time(vcpu);
>>>   }
>>
>> This records time spent in userspace in the vcpu thread as steal
>> time.  Is this what we want?  Or just time preempted away?
>
> It also accounts halt time (kvm_vcpu_block) as steal time. Glauber, you
> could instead use the "runnable-state-but-waiting-in-runqueue" field of
> >SCHEDSTATS, I forgot the exact name.
>
I thought about it in the past. I set the idea aside because I didn't
want to introduce a dependency on SCHEDSTATS. But thinking about it
again now (and after some days of experimentation with it), I think we
could have both.

Use run_delay (the field you were thinking of) when schedstats are
available, and fall back to an estimate method like the one we're doing
when it is not.

Objections?
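
For concreteness, the schedstats path could look roughly like the sketch
below, where last_steal is a hypothetical new field in vcpu->arch.st that
caches the previous reading of run_delay (time spent runnable but waiting
in the runqueue):

	/* Sketch: charge runnable-but-not-running time as steal when
	 * schedstats are available, instead of estimating it from
	 * timestamps taken at vcpu_put/vcpu_load. */
	u64 now = current->sched_info.run_delay;

	vcpu->arch.st.steal.steal += now - vcpu->arch.st.last_steal;
	vcpu->arch.st.last_steal = now;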
Marcelo Tosatti June 28, 2011, 6:02 p.m. UTC | #7
On Tue, Jun 28, 2011 at 09:30:29AM -0300, Glauber Costa wrote:
> On 06/20/2011 05:56 PM, Marcelo Tosatti wrote:
> >On Sun, Jun 19, 2011 at 12:57:53PM +0300, Avi Kivity wrote:
> >>On 06/17/2011 01:20 AM, Glauber Costa wrote:
> >>>To implement steal time, we need the hypervisor to pass the guest information
> >>>about how much time was spent running other processes outside the VM.
> >>>This is per-vcpu, and using the kvmclock structure for that would be an abuse
> >>>we decided against.
> >>>
> >>>In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
> >>>holds the address of the memory area containing information about steal time.
> >>>
> >>>This patch contains the hypervisor part for it. I am keeping it separate from
> >>>the headers to facilitate backports for people who want to backport the kernel
> >>>part but not the hypervisor, or the other way around.
> >>>
> >>>
> >>>
> >>>+#define KVM_STEAL_ALIGNMENT_BITS 5
> >>>+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
> >>>+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
> >>
> >>Clumsy, but okay.
> >>
> >>>+static void record_steal_time(struct kvm_vcpu *vcpu)
> >>>+{
> >>>+	u64 delta;
> >>>+
> >>>+	if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
> >>
> >>0 is a valid value for stime.
> >>
> >>>+
> >>>+		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
> >>>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>>+
> >>>+			vcpu->arch.st.stime = 0;
> >>>+			return;
> >>>+		}
> >>>+
> >>>+		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
> >>>+
> >>>+		vcpu->arch.st.steal.steal += delta;
> >>>+		vcpu->arch.st.steal.version += 2;
> >>>+
> >>>+		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> >>>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>>+
> >>>+			vcpu->arch.st.stime = 0;
> >>>+			return;
> >>>+		}
> >>>+	}
> >>>+
> >>>+}
> >>>+
> >>>
> >>>@@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>>  			kvm_migrate_timers(vcpu);
> >>>  		vcpu->cpu = cpu;
> >>>  	}
> >>>+
> >>>+	record_steal_time(vcpu);
> >>>  }
> >>
> >>This records time spent in userspace in the vcpu thread as steal
> >>time.  Is this what we want?  Or just time preempted away?
> >
> >It also accounts halt time (kvm_vcpu_block) as steal time. Glauber, you
> >could instead use the "runnable-state-but-waiting-in-runqueue" field of
> >SCHEDSTATS, I forgot the exact name.
> >
> I thought about it in the past. I set the idea aside because I
> didn't want to introduce a dependency on SCHEDSTATS. But thinking
> about it again now (and after some days of experimentation with
> it), I think we could have both.
> 
> Use run_delay (the field you were thinking of) when schedstats are
> available, and fall back to an estimate method like the one we're
> doing when it is not.
> 
> Objections?

I'm okay with that.


Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc38eca..5dce014 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -388,6 +388,14 @@  struct kvm_vcpu_arch {
 	unsigned int hw_tsc_khz;
 	unsigned int time_offset;
 	struct page *time_page;
+
+	struct {
+		u64 msr_val;
+		gpa_t stime;
+		struct kvm_steal_time steal;
+		u64 this_time_out;
+	} st;
+
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
 	u64 last_tsc_nsec;
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ac306c4..0341e61 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -45,6 +45,10 @@  struct kvm_steal_time {
 	__u32 pad[6];
 };
 
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
+
 #define KVM_MAX_MMU_OP_BATCH           32
 
 #define KVM_ASYNC_PF_ENABLED			(1 << 0)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6645634..10fe028 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -797,12 +797,12 @@  EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	8
+#define KVM_SAVE_MSRS_BEGIN	9
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1480,6 +1480,34 @@  static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+	u64 delta;
+
+	if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
+
+		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
+
+			vcpu->arch.st.stime = 0;
+			return;
+		}
+
+		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
+
+		vcpu->arch.st.steal.steal += delta;
+		vcpu->arch.st.steal.version += 2;
+
+		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
+
+			vcpu->arch.st.stime = 0;
+			return;
+		}
+	}
+
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
@@ -1562,6 +1590,23 @@  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (kvm_pv_enable_async_pf(vcpu, data))
 			return 1;
 		break;
+	case MSR_KVM_STEAL_TIME:
+		vcpu->arch.st.msr_val = data;
+
+		if (!(data & KVM_MSR_ENABLED)) {
+			vcpu->arch.st.stime = 0;
+			break;
+		}
+
+		if (data & KVM_STEAL_RESERVED_MASK)
+			return 1;
+
+		vcpu->arch.st.this_time_out = get_kernel_ns();
+		vcpu->arch.st.stime = data & KVM_STEAL_VALID_BITS;
+		record_steal_time(vcpu);
+
+		break;
+
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1847,6 +1892,9 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_ASYNC_PF_EN:
 		data = vcpu->arch.apf.msr_val;
 		break;
+	case MSR_KVM_STEAL_TIME:
+		data = vcpu->arch.st.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -2158,6 +2206,8 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
 	}
+
+	record_steal_time(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2165,6 +2215,7 @@  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
 	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	vcpu->arch.st.this_time_out = get_kernel_ns();
 }
 
 static int is_efer_nx(void)
@@ -2477,7 +2528,8 @@  static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+			     (1 << KVM_FEATURE_STEAL_TIME);
 		entry->ebx = 0;
 		entry->ecx = 0;
 		entry->edx = 0;
@@ -6200,6 +6252,8 @@  int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvmclock_reset(vcpu);
 
+	vcpu->arch.st.stime = 0;
+
 	kvm_clear_async_pf_completion_queue(vcpu);
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;
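
As a usage note, the version field is bumped by two around every update,
so a guest can detect that an update raced with its read by re-checking
version, much like the kvmclock protocol. A guest-side sketch (not part
of this patch) might read the area like this:

	/* Sketch: fetch the accumulated steal time, retrying if the
	 * hypervisor updated the area mid-read. */
	static u64 read_steal_time(struct kvm_steal_time *st)
	{
		u32 version;
		u64 steal;

		do {
			version = st->version;
			rmb();		/* read steal after version */
			steal = st->steal;
			rmb();		/* re-check version after steal */
		} while (version != st->version);

		return steal;
	}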