diff mbox

[3/7] KVM-HV: KVM Steal time implementation

Message ID 1308007897-17013-4-git-send-email-glommer@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Glauber Costa June 13, 2011, 11:31 p.m. UTC
To implement steal time, we need the hypervisor to pass the guest information
about how much time was spent running other processes outside the VM.
This is per-vcpu, and using the kvmclock structure for that is an abuse
we decided not to make.

In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
holds the memory area address containing information about steal time

This patch contains the hypervisor part for it. I am keeping it separate from
the headers to facilitate backports to people who wants to backport the kernel
part but not the hypervisor, or the other way around.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: Rik van Riel <riel@redhat.com>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Avi Kivity <avi@redhat.com>
CC: Anthony Liguori <aliguori@us.ibm.com>
CC: Eric B Munson <emunson@mgebm.net>
---
 arch/x86/include/asm/kvm_host.h |    8 +++++
 arch/x86/include/asm/kvm_para.h |    4 ++
 arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 69 insertions(+), 3 deletions(-)

Comments

Eric B Munson June 14, 2011, 1:20 a.m. UTC | #1
On Mon, 13 Jun 2011, Glauber Costa wrote:

> To implement steal time, we need the hypervisor to pass the guest information
> about how much time was spent running other processes outside the VM.
> This is per-vcpu, and using the kvmclock structure for that is an abuse
> we decided not to make.
> 
> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
> holds the memory area address containing information about steal time
> 
> This patch contains the hypervisor part for it. I am keeping it separate from
> the headers to facilitate backports to people who wants to backport the kernel
> part but not the hypervisor, or the other way around.
> 
> Signed-off-by: Glauber Costa <glommer@redhat.com>
> CC: Rik van Riel <riel@redhat.com>
> CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Avi Kivity <avi@redhat.com>
> CC: Anthony Liguori <aliguori@us.ibm.com>
> CC: Eric B Munson <emunson@mgebm.net>

Tested-by: Eric B Munson <emunson@mgebm.net>
Gleb Natapov June 14, 2011, 7:45 a.m. UTC | #2
On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
> To implement steal time, we need the hypervisor to pass the guest information
> about how much time was spent running other processes outside the VM.
> This is per-vcpu, and using the kvmclock structure for that is an abuse
> we decided not to make.
> 
> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
> holds the memory area address containing information about steal time
> 
> This patch contains the hypervisor part for it. I am keeping it separate from
> the headers to facilitate backports to people who wants to backport the kernel
> part but not the hypervisor, or the other way around.
> 
> Signed-off-by: Glauber Costa <glommer@redhat.com>
> CC: Rik van Riel <riel@redhat.com>
> CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Avi Kivity <avi@redhat.com>
> CC: Anthony Liguori <aliguori@us.ibm.com>
> CC: Eric B Munson <emunson@mgebm.net>
> ---
>  arch/x86/include/asm/kvm_host.h |    8 +++++
>  arch/x86/include/asm/kvm_para.h |    4 ++
>  arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
>  3 files changed, 69 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fc38eca..5dce014 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
>  	unsigned int hw_tsc_khz;
>  	unsigned int time_offset;
>  	struct page *time_page;
> +
> +	struct {
> +		u64 msr_val;
> +		gpa_t stime;
> +		struct kvm_steal_time steal;
> +		u64 this_time_out;
> +	} st;
> +
>  	u64 last_guest_tsc;
>  	u64 last_kernel_ns;
>  	u64 last_tsc_nsec;
> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> index ac306c4..0341e61 100644
> --- a/arch/x86/include/asm/kvm_para.h
> +++ b/arch/x86/include/asm/kvm_para.h
> @@ -45,6 +45,10 @@ struct kvm_steal_time {
>  	__u32 pad[6];
>  };
>  
> +#define KVM_STEAL_ALIGNMENT_BITS 5
> +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
> +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
> +
>  #define KVM_MAX_MMU_OP_BATCH           32
>  
>  #define KVM_ASYNC_PF_ENABLED			(1 << 0)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6645634..10fe028 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
>   * kvm-specific. Those are put in the beginning of the list.
>   */
>  
> -#define KVM_SAVE_MSRS_BEGIN	8
> +#define KVM_SAVE_MSRS_BEGIN	9
>  static u32 msrs_to_save[] = {
>  	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
>  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
>  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
> -	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
> +	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
>  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
>  	MSR_STAR,
>  #ifdef CONFIG_X86_64
> @@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
>  	}
>  }
>  
> +static void record_steal_time(struct kvm_vcpu *vcpu)
> +{
> +	u64 delta;
> +
> +	if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
> +
> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> +
> +			vcpu->arch.st.stime = 0;
> +			return;
> +		}
> +
> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
> +
> +		vcpu->arch.st.steal.steal += delta;
> +		vcpu->arch.st.steal.version += 2;
> +
> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
for the read above?

> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> +
> +			vcpu->arch.st.stime = 0;
> +			return;
> +		}
> +	}
> +
> +}
> +
>  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  {
>  	switch (msr) {
> @@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  		if (kvm_pv_enable_async_pf(vcpu, data))
>  			return 1;
>  		break;
> +	case MSR_KVM_STEAL_TIME:
> +		vcpu->arch.st.msr_val = data;
> +
> +		if (!(data & KVM_MSR_ENABLED)) {
> +			vcpu->arch.st.stime = 0;
> +			break;
> +		}
> +
> +		if (data & KVM_STEAL_RESERVED_MASK)
> +			return 1;
> +
> +		vcpu->arch.st.this_time_out = get_kernel_ns();
> +		vcpu->arch.st.stime = data & KVM_STEAL_VALID_BITS;
> +		record_steal_time(vcpu);
> +
> +		break;
> +
>  	case MSR_IA32_MCG_CTL:
>  	case MSR_IA32_MCG_STATUS:
>  	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
> @@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>  	case MSR_KVM_ASYNC_PF_EN:
>  		data = vcpu->arch.apf.msr_val;
>  		break;
> +	case MSR_KVM_STEAL_TIME:
> +		data = vcpu->arch.st.msr_val;
> +		break;
>  	case MSR_IA32_P5_MC_ADDR:
>  	case MSR_IA32_P5_MC_TYPE:
>  	case MSR_IA32_MCG_CAP:
> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  			kvm_migrate_timers(vcpu);
>  		vcpu->cpu = cpu;
>  	}
> +
> +	record_steal_time(vcpu);
>  }
>  
>  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> @@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->vcpu_put(vcpu);
>  	kvm_put_guest_fpu(vcpu);
>  	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
> +	vcpu->arch.st.this_time_out = get_kernel_ns();
>  }
>  
Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
just before/after entering/exiting a guest? vcpu_(put|get) are called
for each vcpu ioctl, not only VCPU_RUN.

>  static int is_efer_nx(void)
> @@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>  			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
>  			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
>  			     (1 << KVM_FEATURE_ASYNC_PF) |
> -			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
> +			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> +			     (1 << KVM_FEATURE_STEAL_TIME);
>  		entry->ebx = 0;
>  		entry->ecx = 0;
>  		entry->edx = 0;
> @@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
>  
>  	kvmclock_reset(vcpu);
>  
> +	vcpu->arch.st.stime = 0;
> +
>  	kvm_clear_async_pf_completion_queue(vcpu);
>  	kvm_async_pf_hash_reset(vcpu);
>  	vcpu->arch.apf.halted = false;
> -- 
> 1.7.3.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rik van Riel June 14, 2011, 12:20 p.m. UTC | #3
On 06/13/2011 07:31 PM, Glauber Costa wrote:
> To implement steal time, we need the hypervisor to pass the guest information
> about how much time was spent running other processes outside the VM.
> This is per-vcpu, and using the kvmclock structure for that is an abuse
> we decided not to make.
>
> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
> holds the memory area address containing information about steal time
>
> This patch contains the hypervisor part for it. I am keeping it separate from
> the headers to facilitate backports to people who wants to backport the kernel
> part but not the hypervisor, or the other way around.
>
> Signed-off-by: Glauber Costa<glommer@redhat.com>
> CC: Rik van Riel<riel@redhat.com>
> CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
> CC: Peter Zijlstra<peterz@infradead.org>
> CC: Avi Kivity<avi@redhat.com>
> CC: Anthony Liguori<aliguori@us.ibm.com>
> CC: Eric B Munson<emunson@mgebm.net>

Acked-by: Rik van Riel <riel@redhat.com>
Glauber Costa June 15, 2011, 1:01 a.m. UTC | #4
On 06/14/2011 04:45 AM, Gleb Natapov wrote:
> On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
>> To implement steal time, we need the hypervisor to pass the guest information
>> about how much time was spent running other processes outside the VM.
>> This is per-vcpu, and using the kvmclock structure for that is an abuse
>> we decided not to make.
>>
>> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
>> holds the memory area address containing information about steal time
>>
>> This patch contains the hypervisor part for it. I am keeping it separate from
>> the headers to facilitate backports to people who wants to backport the kernel
>> part but not the hypervisor, or the other way around.
>>
>> Signed-off-by: Glauber Costa<glommer@redhat.com>
>> CC: Rik van Riel<riel@redhat.com>
>> CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
>> CC: Peter Zijlstra<peterz@infradead.org>
>> CC: Avi Kivity<avi@redhat.com>
>> CC: Anthony Liguori<aliguori@us.ibm.com>
>> CC: Eric B Munson<emunson@mgebm.net>
>> ---
>>   arch/x86/include/asm/kvm_host.h |    8 +++++
>>   arch/x86/include/asm/kvm_para.h |    4 ++
>>   arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
>>   3 files changed, 69 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index fc38eca..5dce014 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
>>   	unsigned int hw_tsc_khz;
>>   	unsigned int time_offset;
>>   	struct page *time_page;
>> +
>> +	struct {
>> +		u64 msr_val;
>> +		gpa_t stime;
>> +		struct kvm_steal_time steal;
>> +		u64 this_time_out;
>> +	} st;
>> +
>>   	u64 last_guest_tsc;
>>   	u64 last_kernel_ns;
>>   	u64 last_tsc_nsec;
>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>> index ac306c4..0341e61 100644
>> --- a/arch/x86/include/asm/kvm_para.h
>> +++ b/arch/x86/include/asm/kvm_para.h
>> @@ -45,6 +45,10 @@ struct kvm_steal_time {
>>   	__u32 pad[6];
>>   };
>>
>> +#define KVM_STEAL_ALIGNMENT_BITS 5
>> +#define KVM_STEAL_VALID_BITS ((-1ULL<<  (KVM_STEAL_ALIGNMENT_BITS + 1)))
>> +#define KVM_STEAL_RESERVED_MASK (((1<<  KVM_STEAL_ALIGNMENT_BITS) - 1 )<<  1)
>> +
>>   #define KVM_MAX_MMU_OP_BATCH           32
>>
>>   #define KVM_ASYNC_PF_ENABLED			(1<<  0)
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 6645634..10fe028 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
>>    * kvm-specific. Those are put in the beginning of the list.
>>    */
>>
>> -#define KVM_SAVE_MSRS_BEGIN	8
>> +#define KVM_SAVE_MSRS_BEGIN	9
>>   static u32 msrs_to_save[] = {
>>   	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
>>   	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
>>   	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
>> -	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
>> +	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
>>   	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
>>   	MSR_STAR,
>>   #ifdef CONFIG_X86_64
>> @@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
>>   	}
>>   }
>>
>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>> +{
>> +	u64 delta;
>> +
>> +	if (vcpu->arch.st.stime&&  vcpu->arch.st.this_time_out) {
>> +
>> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>> +
>> +			vcpu->arch.st.stime = 0;
>> +			return;
>> +		}
>> +
>> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
>> +
>> +		vcpu->arch.st.steal.steal += delta;
>> +		vcpu->arch.st.steal.version += 2;
>> +
>> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
> for the read above?

Good idea, I can do that.

>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>> +
>> +			vcpu->arch.st.stime = 0;
>> +			return;
>> +		}
>> +	}
>> +
>> +}
>> +
>>   int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>   {
>>   	switch (msr) {
>> @@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>   		if (kvm_pv_enable_async_pf(vcpu, data))
>>   			return 1;
>>   		break;
>> +	case MSR_KVM_STEAL_TIME:
>> +		vcpu->arch.st.msr_val = data;
>> +
>> +		if (!(data&  KVM_MSR_ENABLED)) {
>> +			vcpu->arch.st.stime = 0;
>> +			break;
>> +		}
>> +
>> +		if (data&  KVM_STEAL_RESERVED_MASK)
>> +			return 1;
>> +
>> +		vcpu->arch.st.this_time_out = get_kernel_ns();
>> +		vcpu->arch.st.stime = data&  KVM_STEAL_VALID_BITS;
>> +		record_steal_time(vcpu);
>> +
>> +		break;
>> +
>>   	case MSR_IA32_MCG_CTL:
>>   	case MSR_IA32_MCG_STATUS:
>>   	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
>> @@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>>   	case MSR_KVM_ASYNC_PF_EN:
>>   		data = vcpu->arch.apf.msr_val;
>>   		break;
>> +	case MSR_KVM_STEAL_TIME:
>> +		data = vcpu->arch.st.msr_val;
>> +		break;
>>   	case MSR_IA32_P5_MC_ADDR:
>>   	case MSR_IA32_P5_MC_TYPE:
>>   	case MSR_IA32_MCG_CAP:
>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>   			kvm_migrate_timers(vcpu);
>>   		vcpu->cpu = cpu;
>>   	}
>> +
>> +	record_steal_time(vcpu);
>>   }
>>
>>   void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>> @@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>   	kvm_x86_ops->vcpu_put(vcpu);
>>   	kvm_put_guest_fpu(vcpu);
>>   	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
>> +	vcpu->arch.st.this_time_out = get_kernel_ns();
>>   }
>>
> Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
> just before/after entering/exiting a guest? vcpu_(put|get) are called
> for each vcpu ioctl, not only VCPU_RUN.
>
>>   static int is_efer_nx(void)
>> @@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>   			     (1<<  KVM_FEATURE_NOP_IO_DELAY) |
>>   			     (1<<  KVM_FEATURE_CLOCKSOURCE2) |
>>   			     (1<<  KVM_FEATURE_ASYNC_PF) |
>> -			     (1<<  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
>> +			     (1<<  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
>> +			     (1<<  KVM_FEATURE_STEAL_TIME);
>>   		entry->ebx = 0;
>>   		entry->ecx = 0;
>>   		entry->edx = 0;
>> @@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
>>
>>   	kvmclock_reset(vcpu);
>>
>> +	vcpu->arch.st.stime = 0;
>> +
>>   	kvm_clear_async_pf_completion_queue(vcpu);
>>   	kvm_async_pf_hash_reset(vcpu);
>>   	vcpu->arch.apf.halted = false;
>> --
>> 1.7.3.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> --
> 			Gleb.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa June 15, 2011, 3:09 a.m. UTC | #5
On 06/14/2011 04:45 AM, Gleb Natapov wrote:
> On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
>> To implement steal time, we need the hypervisor to pass the guest information
>> about how much time was spent running other processes outside the VM.
>> This is per-vcpu, and using the kvmclock structure for that is an abuse
>> we decided not to make.
>>
>> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
>> holds the memory area address containing information about steal time
>>
>> This patch contains the hypervisor part for it. I am keeping it separate from
>> the headers to facilitate backports to people who wants to backport the kernel
>> part but not the hypervisor, or the other way around.
>>
>> Signed-off-by: Glauber Costa<glommer@redhat.com>
>> CC: Rik van Riel<riel@redhat.com>
>> CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
>> CC: Peter Zijlstra<peterz@infradead.org>
>> CC: Avi Kivity<avi@redhat.com>
>> CC: Anthony Liguori<aliguori@us.ibm.com>
>> CC: Eric B Munson<emunson@mgebm.net>
>> ---
>>   arch/x86/include/asm/kvm_host.h |    8 +++++
>>   arch/x86/include/asm/kvm_para.h |    4 ++
>>   arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
>>   3 files changed, 69 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index fc38eca..5dce014 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
>>   	unsigned int hw_tsc_khz;
>>   	unsigned int time_offset;
>>   	struct page *time_page;
>> +
>> +	struct {
>> +		u64 msr_val;
>> +		gpa_t stime;
>> +		struct kvm_steal_time steal;
>> +		u64 this_time_out;
>> +	} st;
>> +
>>   	u64 last_guest_tsc;
>>   	u64 last_kernel_ns;
>>   	u64 last_tsc_nsec;
>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>> index ac306c4..0341e61 100644
>> --- a/arch/x86/include/asm/kvm_para.h
>> +++ b/arch/x86/include/asm/kvm_para.h
>> @@ -45,6 +45,10 @@ struct kvm_steal_time {
>>   	__u32 pad[6];
>>   };
>>
>> +#define KVM_STEAL_ALIGNMENT_BITS 5
>> +#define KVM_STEAL_VALID_BITS ((-1ULL<<  (KVM_STEAL_ALIGNMENT_BITS + 1)))
>> +#define KVM_STEAL_RESERVED_MASK (((1<<  KVM_STEAL_ALIGNMENT_BITS) - 1 )<<  1)
>> +
>>   #define KVM_MAX_MMU_OP_BATCH           32
>>
>>   #define KVM_ASYNC_PF_ENABLED			(1<<  0)
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 6645634..10fe028 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
>>    * kvm-specific. Those are put in the beginning of the list.
>>    */
>>
>> -#define KVM_SAVE_MSRS_BEGIN	8
>> +#define KVM_SAVE_MSRS_BEGIN	9
>>   static u32 msrs_to_save[] = {
>>   	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
>>   	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
>>   	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
>> -	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
>> +	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
>>   	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
>>   	MSR_STAR,
>>   #ifdef CONFIG_X86_64
>> @@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
>>   	}
>>   }
>>
>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>> +{
>> +	u64 delta;
>> +
>> +	if (vcpu->arch.st.stime&&  vcpu->arch.st.this_time_out) {
>> +
>> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>> +
>> +			vcpu->arch.st.stime = 0;
>> +			return;
>> +		}
>> +
>> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
>> +
>> +		vcpu->arch.st.steal.steal += delta;
>> +		vcpu->arch.st.steal.version += 2;
>> +
>> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
> for the read above?

Actually, I'd expect most read/writes to benefit from caching, no?
So why don't we just rename kvm_write_guest_cached() to 
kvm_write_guest(), and the few places - if any - that need to force 
transversing of the gfn mappings, get renamed to kvm_write_guest_uncached ?

>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>> +
>> +			vcpu->arch.st.stime = 0;
>> +			return;
>> +		}
>> +	}
>> +
>> +}
>> +
>>   int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>   {
>>   	switch (msr) {
>> @@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>   		if (kvm_pv_enable_async_pf(vcpu, data))
>>   			return 1;
>>   		break;
>> +	case MSR_KVM_STEAL_TIME:
>> +		vcpu->arch.st.msr_val = data;
>> +
>> +		if (!(data&  KVM_MSR_ENABLED)) {
>> +			vcpu->arch.st.stime = 0;
>> +			break;
>> +		}
>> +
>> +		if (data&  KVM_STEAL_RESERVED_MASK)
>> +			return 1;
>> +
>> +		vcpu->arch.st.this_time_out = get_kernel_ns();
>> +		vcpu->arch.st.stime = data&  KVM_STEAL_VALID_BITS;
>> +		record_steal_time(vcpu);
>> +
>> +		break;
>> +
>>   	case MSR_IA32_MCG_CTL:
>>   	case MSR_IA32_MCG_STATUS:
>>   	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
>> @@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>>   	case MSR_KVM_ASYNC_PF_EN:
>>   		data = vcpu->arch.apf.msr_val;
>>   		break;
>> +	case MSR_KVM_STEAL_TIME:
>> +		data = vcpu->arch.st.msr_val;
>> +		break;
>>   	case MSR_IA32_P5_MC_ADDR:
>>   	case MSR_IA32_P5_MC_TYPE:
>>   	case MSR_IA32_MCG_CAP:
>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>   			kvm_migrate_timers(vcpu);
>>   		vcpu->cpu = cpu;
>>   	}
>> +
>> +	record_steal_time(vcpu);
>>   }
>>
>>   void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>> @@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>   	kvm_x86_ops->vcpu_put(vcpu);
>>   	kvm_put_guest_fpu(vcpu);
>>   	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
>> +	vcpu->arch.st.this_time_out = get_kernel_ns();
>>   }
>>
> Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
> just before/after entering/exiting a guest? vcpu_(put|get) are called
> for each vcpu ioctl, not only VCPU_RUN.
Sorry, missed that the first time I've read your e-mail.

If done like you said, time spent on the hypervisor is accounted as 
steal time. I don't think it is.

Steal time is time spent running someone else's job instead of yours. 
The name for the time spent in the hypervisor doing something for *you* 
is just overhead.

>
>>   static int is_efer_nx(void)
>> @@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>   			     (1<<  KVM_FEATURE_NOP_IO_DELAY) |
>>   			     (1<<  KVM_FEATURE_CLOCKSOURCE2) |
>>   			     (1<<  KVM_FEATURE_ASYNC_PF) |
>> -			     (1<<  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
>> +			     (1<<  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
>> +			     (1<<  KVM_FEATURE_STEAL_TIME);
>>   		entry->ebx = 0;
>>   		entry->ecx = 0;
>>   		entry->edx = 0;
>> @@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
>>
>>   	kvmclock_reset(vcpu);
>>
>> +	vcpu->arch.st.stime = 0;
>> +
>>   	kvm_clear_async_pf_completion_queue(vcpu);
>>   	kvm_async_pf_hash_reset(vcpu);
>>   	vcpu->arch.apf.halted = false;
>> --
>> 1.7.3.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> --
> 			Gleb.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov June 15, 2011, 9:09 a.m. UTC | #6
On Wed, Jun 15, 2011 at 12:09:31AM -0300, Glauber Costa wrote:
> On 06/14/2011 04:45 AM, Gleb Natapov wrote:
> >On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
> >>To implement steal time, we need the hypervisor to pass the guest information
> >>about how much time was spent running other processes outside the VM.
> >>This is per-vcpu, and using the kvmclock structure for that is an abuse
> >>we decided not to make.
> >>
> >>In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
> >>holds the memory area address containing information about steal time
> >>
> >>This patch contains the hypervisor part for it. I am keeping it separate from
> >>the headers to facilitate backports to people who wants to backport the kernel
> >>part but not the hypervisor, or the other way around.
> >>
> >>Signed-off-by: Glauber Costa<glommer@redhat.com>
> >>CC: Rik van Riel<riel@redhat.com>
> >>CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
> >>CC: Peter Zijlstra<peterz@infradead.org>
> >>CC: Avi Kivity<avi@redhat.com>
> >>CC: Anthony Liguori<aliguori@us.ibm.com>
> >>CC: Eric B Munson<emunson@mgebm.net>
> >>---
> >>  arch/x86/include/asm/kvm_host.h |    8 +++++
> >>  arch/x86/include/asm/kvm_para.h |    4 ++
> >>  arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
> >>  3 files changed, 69 insertions(+), 3 deletions(-)
> >>
> >>diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >>index fc38eca..5dce014 100644
> >>--- a/arch/x86/include/asm/kvm_host.h
> >>+++ b/arch/x86/include/asm/kvm_host.h
> >>@@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
> >>  	unsigned int hw_tsc_khz;
> >>  	unsigned int time_offset;
> >>  	struct page *time_page;
> >>+
> >>+	struct {
> >>+		u64 msr_val;
> >>+		gpa_t stime;
> >>+		struct kvm_steal_time steal;
> >>+		u64 this_time_out;
> >>+	} st;
> >>+
> >>  	u64 last_guest_tsc;
> >>  	u64 last_kernel_ns;
> >>  	u64 last_tsc_nsec;
> >>diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> >>index ac306c4..0341e61 100644
> >>--- a/arch/x86/include/asm/kvm_para.h
> >>+++ b/arch/x86/include/asm/kvm_para.h
> >>@@ -45,6 +45,10 @@ struct kvm_steal_time {
> >>  	__u32 pad[6];
> >>  };
> >>
> >>+#define KVM_STEAL_ALIGNMENT_BITS 5
> >>+#define KVM_STEAL_VALID_BITS ((-1ULL<<  (KVM_STEAL_ALIGNMENT_BITS + 1)))
> >>+#define KVM_STEAL_RESERVED_MASK (((1<<  KVM_STEAL_ALIGNMENT_BITS) - 1 )<<  1)
> >>+
> >>  #define KVM_MAX_MMU_OP_BATCH           32
> >>
> >>  #define KVM_ASYNC_PF_ENABLED			(1<<  0)
> >>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>index 6645634..10fe028 100644
> >>--- a/arch/x86/kvm/x86.c
> >>+++ b/arch/x86/kvm/x86.c
> >>@@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
> >>   * kvm-specific. Those are put in the beginning of the list.
> >>   */
> >>
> >>-#define KVM_SAVE_MSRS_BEGIN	8
> >>+#define KVM_SAVE_MSRS_BEGIN	9
> >>  static u32 msrs_to_save[] = {
> >>  	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
> >>  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
> >>  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
> >>-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
> >>+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
> >>  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
> >>  	MSR_STAR,
> >>  #ifdef CONFIG_X86_64
> >>@@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
> >>  	}
> >>  }
> >>
> >>+static void record_steal_time(struct kvm_vcpu *vcpu)
> >>+{
> >>+	u64 delta;
> >>+
> >>+	if (vcpu->arch.st.stime&&  vcpu->arch.st.this_time_out) {
> >>+
> >>+		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
> >>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>+
> >>+			vcpu->arch.st.stime = 0;
> >>+			return;
> >>+		}
> >>+
> >>+		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
> >>+
> >>+		vcpu->arch.st.steal.steal += delta;
> >>+		vcpu->arch.st.steal.version += 2;
> >>+
> >>+		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> >Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
> >for the read above?
> 
> Actually, I'd expect most read/writes to benefit from caching, no?
> So why don't we just rename kvm_write_guest_cached() to
> kvm_write_guest(), and the few places - if any - that need to force
> transversing of the gfn mappings, get renamed to
> kvm_write_guest_uncached ?
> 
Good idea. I do not see any places where kvm_write_guest_uncached is
needed from a brief look. Avi?

> >>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>+
> >>+			vcpu->arch.st.stime = 0;
> >>+			return;
> >>+		}
> >>+	}
> >>+
> >>+}
> >>+
> >>  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >>  {
> >>  	switch (msr) {
> >>@@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >>  		if (kvm_pv_enable_async_pf(vcpu, data))
> >>  			return 1;
> >>  		break;
> >>+	case MSR_KVM_STEAL_TIME:
> >>+		vcpu->arch.st.msr_val = data;
> >>+
> >>+		if (!(data&  KVM_MSR_ENABLED)) {
> >>+			vcpu->arch.st.stime = 0;
> >>+			break;
> >>+		}
> >>+
> >>+		if (data&  KVM_STEAL_RESERVED_MASK)
> >>+			return 1;
> >>+
> >>+		vcpu->arch.st.this_time_out = get_kernel_ns();
> >>+		vcpu->arch.st.stime = data&  KVM_STEAL_VALID_BITS;
> >>+		record_steal_time(vcpu);
> >>+
> >>+		break;
> >>+
> >>  	case MSR_IA32_MCG_CTL:
> >>  	case MSR_IA32_MCG_STATUS:
> >>  	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
> >>@@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> >>  	case MSR_KVM_ASYNC_PF_EN:
> >>  		data = vcpu->arch.apf.msr_val;
> >>  		break;
> >>+	case MSR_KVM_STEAL_TIME:
> >>+		data = vcpu->arch.st.msr_val;
> >>+		break;
> >>  	case MSR_IA32_P5_MC_ADDR:
> >>  	case MSR_IA32_P5_MC_TYPE:
> >>  	case MSR_IA32_MCG_CAP:
> >>@@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>  			kvm_migrate_timers(vcpu);
> >>  		vcpu->cpu = cpu;
> >>  	}
> >>+
> >>+	record_steal_time(vcpu);
> >>  }
> >>
> >>  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>@@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>  	kvm_x86_ops->vcpu_put(vcpu);
> >>  	kvm_put_guest_fpu(vcpu);
> >>  	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
> >>+	vcpu->arch.st.this_time_out = get_kernel_ns();
> >>  }
> >>
> >Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
> >just before/after entering/exiting a guest? vcpu_(put|get) are called
> >for each vcpu ioctl, not only VCPU_RUN.
> Sorry, missed that the first time I've read your e-mail.
> 
> If done like you said, time spent on the hypervisor is accounted as
> steal time. I don't think it is.
I thought that this is the point of a steal time. Running other
tasks/guests is a hypervisor overhead too after all :) Also what about
time spend serving host interrupts between put/get? It will not be
accounted as steal time, correct?

> 
> Steal time is time spent running someone else's job instead of
> yours. The name for the time spent in the hypervisor doing something
> for *you* is just overhead.
OK. That is the question of a definition I guess. If you define it like
that the code is correct.

> 
> >
> >>  static int is_efer_nx(void)
> >>@@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> >>  			     (1<<  KVM_FEATURE_NOP_IO_DELAY) |
> >>  			     (1<<  KVM_FEATURE_CLOCKSOURCE2) |
> >>  			     (1<<  KVM_FEATURE_ASYNC_PF) |
> >>-			     (1<<  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
> >>+			     (1<<  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> >>+			     (1<<  KVM_FEATURE_STEAL_TIME);
> >>  		entry->ebx = 0;
> >>  		entry->ecx = 0;
> >>  		entry->edx = 0;
> >>@@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
> >>
> >>  	kvmclock_reset(vcpu);
> >>
> >>+	vcpu->arch.st.stime = 0;
> >>+
> >>  	kvm_clear_async_pf_completion_queue(vcpu);
> >>  	kvm_async_pf_hash_reset(vcpu);
> >>  	vcpu->arch.apf.halted = false;
> >>--
> >>1.7.3.4
> >>
> >>--
> >>To unsubscribe from this list: send the line "unsubscribe kvm" in
> >>the body of a message to majordomo@vger.kernel.org
> >>More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> >--
> >			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa June 16, 2011, 3:31 a.m. UTC | #7
On 06/15/2011 06:09 AM, Gleb Natapov wrote:
> On Wed, Jun 15, 2011 at 12:09:31AM -0300, Glauber Costa wrote:
>> On 06/14/2011 04:45 AM, Gleb Natapov wrote:
>>> On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
>>>> To implement steal time, we need the hypervisor to pass the guest information
>>>> about how much time was spent running other processes outside the VM.
>>>> This is per-vcpu, and using the kvmclock structure for that is an abuse
>>>> we decided not to make.
>>>>
>>>> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
>>>> holds the memory area address containing information about steal time
>>>>
>>>> This patch contains the hypervisor part for it. I am keeping it separate from
>>>> the headers to facilitate backports to people who wants to backport the kernel
>>>> part but not the hypervisor, or the other way around.
>>>>
>>>> Signed-off-by: Glauber Costa<glommer@redhat.com>
>>>> CC: Rik van Riel<riel@redhat.com>
>>>> CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
>>>> CC: Peter Zijlstra<peterz@infradead.org>
>>>> CC: Avi Kivity<avi@redhat.com>
>>>> CC: Anthony Liguori<aliguori@us.ibm.com>
>>>> CC: Eric B Munson<emunson@mgebm.net>
>>>> ---
>>>>   arch/x86/include/asm/kvm_host.h |    8 +++++
>>>>   arch/x86/include/asm/kvm_para.h |    4 ++
>>>>   arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
>>>>   3 files changed, 69 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>> index fc38eca..5dce014 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
>>>>   	unsigned int hw_tsc_khz;
>>>>   	unsigned int time_offset;
>>>>   	struct page *time_page;
>>>> +
>>>> +	struct {
>>>> +		u64 msr_val;
>>>> +		gpa_t stime;
>>>> +		struct kvm_steal_time steal;
>>>> +		u64 this_time_out;
>>>> +	} st;
>>>> +
>>>>   	u64 last_guest_tsc;
>>>>   	u64 last_kernel_ns;
>>>>   	u64 last_tsc_nsec;
>>>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>>>> index ac306c4..0341e61 100644
>>>> --- a/arch/x86/include/asm/kvm_para.h
>>>> +++ b/arch/x86/include/asm/kvm_para.h
>>>> @@ -45,6 +45,10 @@ struct kvm_steal_time {
>>>>   	__u32 pad[6];
>>>>   };
>>>>
>>>> +#define KVM_STEAL_ALIGNMENT_BITS 5
>>>> +#define KVM_STEAL_VALID_BITS ((-1ULL<<   (KVM_STEAL_ALIGNMENT_BITS + 1)))
>>>> +#define KVM_STEAL_RESERVED_MASK (((1<<   KVM_STEAL_ALIGNMENT_BITS) - 1 )<<   1)
>>>> +
>>>>   #define KVM_MAX_MMU_OP_BATCH           32
>>>>
>>>>   #define KVM_ASYNC_PF_ENABLED			(1<<   0)
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index 6645634..10fe028 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
>>>>    * kvm-specific. Those are put in the beginning of the list.
>>>>    */
>>>>
>>>> -#define KVM_SAVE_MSRS_BEGIN	8
>>>> +#define KVM_SAVE_MSRS_BEGIN	9
>>>>   static u32 msrs_to_save[] = {
>>>>   	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
>>>>   	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
>>>>   	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
>>>> -	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
>>>> +	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
>>>>   	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
>>>>   	MSR_STAR,
>>>>   #ifdef CONFIG_X86_64
>>>> @@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
>>>>   	}
>>>>   }
>>>>
>>>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	u64 delta;
>>>> +
>>>> +	if (vcpu->arch.st.stime&&   vcpu->arch.st.this_time_out) {
>>>> +
>>>> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
>>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>>> +
>>>> +			vcpu->arch.st.stime = 0;
>>>> +			return;
>>>> +		}
>>>> +
>>>> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
>>>> +
>>>> +		vcpu->arch.st.steal.steal += delta;
>>>> +		vcpu->arch.st.steal.version += 2;
>>>> +
>>>> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
>>> Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
>>> for the read above?
>>
>> Actually, I'd expect most read/writes to benefit from caching, no?
>> So why don't we just rename kvm_write_guest_cached() to
>> kvm_write_guest(), and the few places - if any - that need to force
>> transversing of the gfn mappings, get renamed to
>> kvm_write_guest_uncached ?
>>
> Good idea. I do not see any places where kvm_write_guest_uncached is
> needed from a brief look. Avi?
>
>>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>>> +
>>>> +			vcpu->arch.st.stime = 0;
>>>> +			return;
>>>> +		}
>>>> +	}
>>>> +
>>>> +}
>>>> +
>>>>   int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>>>   {
>>>>   	switch (msr) {
>>>> @@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>>>   		if (kvm_pv_enable_async_pf(vcpu, data))
>>>>   			return 1;
>>>>   		break;
>>>> +	case MSR_KVM_STEAL_TIME:
>>>> +		vcpu->arch.st.msr_val = data;
>>>> +
>>>> +		if (!(data&   KVM_MSR_ENABLED)) {
>>>> +			vcpu->arch.st.stime = 0;
>>>> +			break;
>>>> +		}
>>>> +
>>>> +		if (data&   KVM_STEAL_RESERVED_MASK)
>>>> +			return 1;
>>>> +
>>>> +		vcpu->arch.st.this_time_out = get_kernel_ns();
>>>> +		vcpu->arch.st.stime = data&   KVM_STEAL_VALID_BITS;
>>>> +		record_steal_time(vcpu);
>>>> +
>>>> +		break;
>>>> +
>>>>   	case MSR_IA32_MCG_CTL:
>>>>   	case MSR_IA32_MCG_STATUS:
>>>>   	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
>>>> @@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>>>>   	case MSR_KVM_ASYNC_PF_EN:
>>>>   		data = vcpu->arch.apf.msr_val;
>>>>   		break;
>>>> +	case MSR_KVM_STEAL_TIME:
>>>> +		data = vcpu->arch.st.msr_val;
>>>> +		break;
>>>>   	case MSR_IA32_P5_MC_ADDR:
>>>>   	case MSR_IA32_P5_MC_TYPE:
>>>>   	case MSR_IA32_MCG_CAP:
>>>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>>   			kvm_migrate_timers(vcpu);
>>>>   		vcpu->cpu = cpu;
>>>>   	}
>>>> +
>>>> +	record_steal_time(vcpu);
>>>>   }
>>>>
>>>>   void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>> @@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>>   	kvm_x86_ops->vcpu_put(vcpu);
>>>>   	kvm_put_guest_fpu(vcpu);
>>>>   	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
>>>> +	vcpu->arch.st.this_time_out = get_kernel_ns();
>>>>   }
>>>>
>>> Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
>>> just before/after entering/exiting a guest? vcpu_(put|get) are called
>>> for each vcpu ioctl, not only VCPU_RUN.
>> Sorry, missed that the first time I've read your e-mail.
>>
>> If done like you said, time spent on the hypervisor is accounted as
>> steal time. I don't think it is.
> I thought that this is the point of a steal time. Running other
> tasks/guests is a hypervisor overhead too after all :) Also what about
> time spend serving host interrupts between put/get? It will not be
> accounted as steal time, correct?

This is mostly semantics. I like to compare this to a normal process: 
There is a difference between time the OS spent on your behalf, doing 
your system calls (sys), and time spent by other processes. Similar 
thing here.

Which put/get are you referring to specifically ? You mean vcpu_put() vs 
vcpu_load() ?

If they are after vcpu_put(), they will, because at this time your 
process is officially out of the cpu.


>>
>> Steal time is time spent running someone else's job instead of
>> yours. The name for the time spent in the hypervisor doing something
>> for *you* is just overhead.
> OK. That is the question of a definition I guess. If you define it like
> that the code is correct.
>
>>
>>>
>>>>   static int is_efer_nx(void)
>>>> @@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>>>   			     (1<<   KVM_FEATURE_NOP_IO_DELAY) |
>>>>   			     (1<<   KVM_FEATURE_CLOCKSOURCE2) |
>>>>   			     (1<<   KVM_FEATURE_ASYNC_PF) |
>>>> -			     (1<<   KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
>>>> +			     (1<<   KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
>>>> +			     (1<<   KVM_FEATURE_STEAL_TIME);
>>>>   		entry->ebx = 0;
>>>>   		entry->ecx = 0;
>>>>   		entry->edx = 0;
>>>> @@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
>>>>
>>>>   	kvmclock_reset(vcpu);
>>>>
>>>> +	vcpu->arch.st.stime = 0;
>>>> +
>>>>   	kvm_clear_async_pf_completion_queue(vcpu);
>>>>   	kvm_async_pf_hash_reset(vcpu);
>>>>   	vcpu->arch.apf.halted = false;
>>>> --
>>>> 1.7.3.4
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>> --
>>> 			Gleb.
>
> --
> 			Gleb.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov June 16, 2011, 11:27 a.m. UTC | #8
On Thu, Jun 16, 2011 at 12:31:02AM -0300, Glauber Costa wrote:
> On 06/15/2011 06:09 AM, Gleb Natapov wrote:
> >On Wed, Jun 15, 2011 at 12:09:31AM -0300, Glauber Costa wrote:
> >>On 06/14/2011 04:45 AM, Gleb Natapov wrote:
> >>>On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
> >>>>To implement steal time, we need the hypervisor to pass the guest information
> >>>>about how much time was spent running other processes outside the VM.
> >>>>This is per-vcpu, and using the kvmclock structure for that is an abuse
> >>>>we decided not to make.
> >>>>
> >>>>In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
> >>>>holds the memory area address containing information about steal time
> >>>>
> >>>>This patch contains the hypervisor part for it. I am keeping it separate from
> >>>>the headers to facilitate backports to people who wants to backport the kernel
> >>>>part but not the hypervisor, or the other way around.
> >>>>
> >>>>Signed-off-by: Glauber Costa<glommer@redhat.com>
> >>>>CC: Rik van Riel<riel@redhat.com>
> >>>>CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
> >>>>CC: Peter Zijlstra<peterz@infradead.org>
> >>>>CC: Avi Kivity<avi@redhat.com>
> >>>>CC: Anthony Liguori<aliguori@us.ibm.com>
> >>>>CC: Eric B Munson<emunson@mgebm.net>
> >>>>---
> >>>>  arch/x86/include/asm/kvm_host.h |    8 +++++
> >>>>  arch/x86/include/asm/kvm_para.h |    4 ++
> >>>>  arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
> >>>>  3 files changed, 69 insertions(+), 3 deletions(-)
> >>>>
> >>>>diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >>>>index fc38eca..5dce014 100644
> >>>>--- a/arch/x86/include/asm/kvm_host.h
> >>>>+++ b/arch/x86/include/asm/kvm_host.h
> >>>>@@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
> >>>>  	unsigned int hw_tsc_khz;
> >>>>  	unsigned int time_offset;
> >>>>  	struct page *time_page;
> >>>>+
> >>>>+	struct {
> >>>>+		u64 msr_val;
> >>>>+		gpa_t stime;
> >>>>+		struct kvm_steal_time steal;
> >>>>+		u64 this_time_out;
> >>>>+	} st;
> >>>>+
> >>>>  	u64 last_guest_tsc;
> >>>>  	u64 last_kernel_ns;
> >>>>  	u64 last_tsc_nsec;
> >>>>diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> >>>>index ac306c4..0341e61 100644
> >>>>--- a/arch/x86/include/asm/kvm_para.h
> >>>>+++ b/arch/x86/include/asm/kvm_para.h
> >>>>@@ -45,6 +45,10 @@ struct kvm_steal_time {
> >>>>  	__u32 pad[6];
> >>>>  };
> >>>>
> >>>>+#define KVM_STEAL_ALIGNMENT_BITS 5
> >>>>+#define KVM_STEAL_VALID_BITS ((-1ULL<<   (KVM_STEAL_ALIGNMENT_BITS + 1)))
> >>>>+#define KVM_STEAL_RESERVED_MASK (((1<<   KVM_STEAL_ALIGNMENT_BITS) - 1 )<<   1)
> >>>>+
> >>>>  #define KVM_MAX_MMU_OP_BATCH           32
> >>>>
> >>>>  #define KVM_ASYNC_PF_ENABLED			(1<<   0)
> >>>>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>>>index 6645634..10fe028 100644
> >>>>--- a/arch/x86/kvm/x86.c
> >>>>+++ b/arch/x86/kvm/x86.c
> >>>>@@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
> >>>>   * kvm-specific. Those are put in the beginning of the list.
> >>>>   */
> >>>>
> >>>>-#define KVM_SAVE_MSRS_BEGIN	8
> >>>>+#define KVM_SAVE_MSRS_BEGIN	9
> >>>>  static u32 msrs_to_save[] = {
> >>>>  	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
> >>>>  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
> >>>>  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
> >>>>-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
> >>>>+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
> >>>>  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
> >>>>  	MSR_STAR,
> >>>>  #ifdef CONFIG_X86_64
> >>>>@@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
> >>>>  	}
> >>>>  }
> >>>>
> >>>>+static void record_steal_time(struct kvm_vcpu *vcpu)
> >>>>+{
> >>>>+	u64 delta;
> >>>>+
> >>>>+	if (vcpu->arch.st.stime&&   vcpu->arch.st.this_time_out) {
> >>>>+
> >>>>+		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
> >>>>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>>>+
> >>>>+			vcpu->arch.st.stime = 0;
> >>>>+			return;
> >>>>+		}
> >>>>+
> >>>>+		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
> >>>>+
> >>>>+		vcpu->arch.st.steal.steal += delta;
> >>>>+		vcpu->arch.st.steal.version += 2;
> >>>>+
> >>>>+		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> >>>Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
> >>>for the read above?
> >>
> >>Actually, I'd expect most read/writes to benefit from caching, no?
> >>So why don't we just rename kvm_write_guest_cached() to
> >>kvm_write_guest(), and the few places - if any - that need to force
> >>transversing of the gfn mappings, get renamed to
> >>kvm_write_guest_uncached ?
> >>
> >Good idea. I do not see any places where kvm_write_guest_uncached is
> >needed from a brief look. Avi?
> >
> >>>>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>>>+
> >>>>+			vcpu->arch.st.stime = 0;
> >>>>+			return;
> >>>>+		}
> >>>>+	}
> >>>>+
> >>>>+}
> >>>>+
> >>>>  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >>>>  {
> >>>>  	switch (msr) {
> >>>>@@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >>>>  		if (kvm_pv_enable_async_pf(vcpu, data))
> >>>>  			return 1;
> >>>>  		break;
> >>>>+	case MSR_KVM_STEAL_TIME:
> >>>>+		vcpu->arch.st.msr_val = data;
> >>>>+
> >>>>+		if (!(data&   KVM_MSR_ENABLED)) {
> >>>>+			vcpu->arch.st.stime = 0;
> >>>>+			break;
> >>>>+		}
> >>>>+
> >>>>+		if (data&   KVM_STEAL_RESERVED_MASK)
> >>>>+			return 1;
> >>>>+
> >>>>+		vcpu->arch.st.this_time_out = get_kernel_ns();
> >>>>+		vcpu->arch.st.stime = data&   KVM_STEAL_VALID_BITS;
> >>>>+		record_steal_time(vcpu);
> >>>>+
> >>>>+		break;
> >>>>+
> >>>>  	case MSR_IA32_MCG_CTL:
> >>>>  	case MSR_IA32_MCG_STATUS:
> >>>>  	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
> >>>>@@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> >>>>  	case MSR_KVM_ASYNC_PF_EN:
> >>>>  		data = vcpu->arch.apf.msr_val;
> >>>>  		break;
> >>>>+	case MSR_KVM_STEAL_TIME:
> >>>>+		data = vcpu->arch.st.msr_val;
> >>>>+		break;
> >>>>  	case MSR_IA32_P5_MC_ADDR:
> >>>>  	case MSR_IA32_P5_MC_TYPE:
> >>>>  	case MSR_IA32_MCG_CAP:
> >>>>@@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>>>  			kvm_migrate_timers(vcpu);
> >>>>  		vcpu->cpu = cpu;
> >>>>  	}
> >>>>+
> >>>>+	record_steal_time(vcpu);
> >>>>  }
> >>>>
> >>>>  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>>>@@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>>>  	kvm_x86_ops->vcpu_put(vcpu);
> >>>>  	kvm_put_guest_fpu(vcpu);
> >>>>  	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
> >>>>+	vcpu->arch.st.this_time_out = get_kernel_ns();
> >>>>  }
> >>>>
> >>>Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
> >>>just before/after entering/exiting a guest? vcpu_(put|get) are called
> >>>for each vcpu ioctl, not only VCPU_RUN.
> >>Sorry, missed that the first time I've read your e-mail.
> >>
> >>If done like you said, time spent on the hypervisor is accounted as
> >>steal time. I don't think it is.
> >I thought that this is the point of a steal time. Running other
> >tasks/guests is a hypervisor overhead too after all :) Also what about
> >time spend serving host interrupts between put/get? It will not be
> >accounted as steal time, correct?
> 
> This is mostly semantics. I like to compare this to a normal
> process: There is a difference between time the OS spent on your
> behalf, doing your system calls (sys), and time spent by other
> processes. Similar thing here.
> 
The problem with this approach is that things like doing "info cpus"
in qemu monitor will change guest scheduling behaviour. Do we want it
to be like that?

> Which put/get are you referring to specifically ? You mean
> vcpu_put() vs vcpu_load() ?
> 
Yes.

> If they are after vcpu_put(), they will, because at this time your
> process is officially out of the cpu.
> 
And if they are between vcpu_load() and vcpu_put() they will be accounted as
a work done on behalf of a guest although they are likely unrelated.

> 
> >>
> >>Steal time is time spent running someone else's job instead of
> >>yours. The name for the time spent in the hypervisor doing something
> >>for *you* is just overhead.
> >OK. That is the question of a definition I guess. If you define it like
> >that the code is correct.
> >
> >>
> >>>
> >>>>  static int is_efer_nx(void)
> >>>>@@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> >>>>  			     (1<<   KVM_FEATURE_NOP_IO_DELAY) |
> >>>>  			     (1<<   KVM_FEATURE_CLOCKSOURCE2) |
> >>>>  			     (1<<   KVM_FEATURE_ASYNC_PF) |
> >>>>-			     (1<<   KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
> >>>>+			     (1<<   KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> >>>>+			     (1<<   KVM_FEATURE_STEAL_TIME);
> >>>>  		entry->ebx = 0;
> >>>>  		entry->ecx = 0;
> >>>>  		entry->edx = 0;
> >>>>@@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
> >>>>
> >>>>  	kvmclock_reset(vcpu);
> >>>>
> >>>>+	vcpu->arch.st.stime = 0;
> >>>>+
> >>>>  	kvm_clear_async_pf_completion_queue(vcpu);
> >>>>  	kvm_async_pf_hash_reset(vcpu);
> >>>>  	vcpu->arch.apf.halted = false;
> >>>>--
> >>>>1.7.3.4
> >>>>
> >>>>--
> >>>>To unsubscribe from this list: send the line "unsubscribe kvm" in
> >>>>the body of a message to majordomo@vger.kernel.org
> >>>>More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>>
> >>>--
> >>>			Gleb.
> >
> >--
> >			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa June 16, 2011, 12:11 p.m. UTC | #9
On 06/16/2011 08:27 AM, Gleb Natapov wrote:
> On Thu, Jun 16, 2011 at 12:31:02AM -0300, Glauber Costa wrote:
>> On 06/15/2011 06:09 AM, Gleb Natapov wrote:
>>> On Wed, Jun 15, 2011 at 12:09:31AM -0300, Glauber Costa wrote:
>>>> On 06/14/2011 04:45 AM, Gleb Natapov wrote:
>>>>> On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
>>>>>> To implement steal time, we need the hypervisor to pass the guest information
>>>>>> about how much time was spent running other processes outside the VM.
>>>>>> This is per-vcpu, and using the kvmclock structure for that is an abuse
>>>>>> we decided not to make.
>>>>>>
>>>>>> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
>>>>>> holds the memory area address containing information about steal time
>>>>>>
>>>>>> This patch contains the hypervisor part for it. I am keeping it separate from
>>>>>> the headers to facilitate backports to people who wants to backport the kernel
>>>>>> part but not the hypervisor, or the other way around.
>>>>>>
>>>>>> Signed-off-by: Glauber Costa<glommer@redhat.com>
>>>>>> CC: Rik van Riel<riel@redhat.com>
>>>>>> CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
>>>>>> CC: Peter Zijlstra<peterz@infradead.org>
>>>>>> CC: Avi Kivity<avi@redhat.com>
>>>>>> CC: Anthony Liguori<aliguori@us.ibm.com>
>>>>>> CC: Eric B Munson<emunson@mgebm.net>
>>>>>> ---
>>>>>>   arch/x86/include/asm/kvm_host.h |    8 +++++
>>>>>>   arch/x86/include/asm/kvm_para.h |    4 ++
>>>>>>   arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
>>>>>>   3 files changed, 69 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>>>> index fc38eca..5dce014 100644
>>>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>>>> @@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
>>>>>>   	unsigned int hw_tsc_khz;
>>>>>>   	unsigned int time_offset;
>>>>>>   	struct page *time_page;
>>>>>> +
>>>>>> +	struct {
>>>>>> +		u64 msr_val;
>>>>>> +		gpa_t stime;
>>>>>> +		struct kvm_steal_time steal;
>>>>>> +		u64 this_time_out;
>>>>>> +	} st;
>>>>>> +
>>>>>>   	u64 last_guest_tsc;
>>>>>>   	u64 last_kernel_ns;
>>>>>>   	u64 last_tsc_nsec;
>>>>>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>>>>>> index ac306c4..0341e61 100644
>>>>>> --- a/arch/x86/include/asm/kvm_para.h
>>>>>> +++ b/arch/x86/include/asm/kvm_para.h
>>>>>> @@ -45,6 +45,10 @@ struct kvm_steal_time {
>>>>>>   	__u32 pad[6];
>>>>>>   };
>>>>>>
>>>>>> +#define KVM_STEAL_ALIGNMENT_BITS 5
>>>>>> +#define KVM_STEAL_VALID_BITS ((-1ULL<<    (KVM_STEAL_ALIGNMENT_BITS + 1)))
>>>>>> +#define KVM_STEAL_RESERVED_MASK (((1<<    KVM_STEAL_ALIGNMENT_BITS) - 1 )<<    1)
>>>>>> +
>>>>>>   #define KVM_MAX_MMU_OP_BATCH           32
>>>>>>
>>>>>>   #define KVM_ASYNC_PF_ENABLED			(1<<    0)
>>>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>>>> index 6645634..10fe028 100644
>>>>>> --- a/arch/x86/kvm/x86.c
>>>>>> +++ b/arch/x86/kvm/x86.c
>>>>>> @@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
>>>>>>    * kvm-specific. Those are put in the beginning of the list.
>>>>>>    */
>>>>>>
>>>>>> -#define KVM_SAVE_MSRS_BEGIN	8
>>>>>> +#define KVM_SAVE_MSRS_BEGIN	9
>>>>>>   static u32 msrs_to_save[] = {
>>>>>>   	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
>>>>>>   	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
>>>>>>   	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
>>>>>> -	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
>>>>>> +	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
>>>>>>   	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
>>>>>>   	MSR_STAR,
>>>>>>   #ifdef CONFIG_X86_64
>>>>>> @@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
>>>>>>   	}
>>>>>>   }
>>>>>>
>>>>>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>>>>>> +{
>>>>>> +	u64 delta;
>>>>>> +
>>>>>> +	if (vcpu->arch.st.stime&&    vcpu->arch.st.this_time_out) {
>>>>>> +
>>>>>> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
>>>>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>>>>> +
>>>>>> +			vcpu->arch.st.stime = 0;
>>>>>> +			return;
>>>>>> +		}
>>>>>> +
>>>>>> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
>>>>>> +
>>>>>> +		vcpu->arch.st.steal.steal += delta;
>>>>>> +		vcpu->arch.st.steal.version += 2;
>>>>>> +
>>>>>> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
>>>>> Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
>>>>> for the read above?
>>>>
>>>> Actually, I'd expect most read/writes to benefit from caching, no?
>>>> So why don't we just rename kvm_write_guest_cached() to
>>>> kvm_write_guest(), and the few places - if any - that need to force
>>>> transversing of the gfn mappings, get renamed to
>>>> kvm_write_guest_uncached ?
>>>>
>>> Good idea. I do not see any places where kvm_write_guest_uncached is
>>> needed from a brief look. Avi?
>>>
>>>>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>>>>> +
>>>>>> +			vcpu->arch.st.stime = 0;
>>>>>> +			return;
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +}
>>>>>> +
>>>>>>   int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>>>>>   {
>>>>>>   	switch (msr) {
>>>>>> @@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>>>>>   		if (kvm_pv_enable_async_pf(vcpu, data))
>>>>>>   			return 1;
>>>>>>   		break;
>>>>>> +	case MSR_KVM_STEAL_TIME:
>>>>>> +		vcpu->arch.st.msr_val = data;
>>>>>> +
>>>>>> +		if (!(data&    KVM_MSR_ENABLED)) {
>>>>>> +			vcpu->arch.st.stime = 0;
>>>>>> +			break;
>>>>>> +		}
>>>>>> +
>>>>>> +		if (data&    KVM_STEAL_RESERVED_MASK)
>>>>>> +			return 1;
>>>>>> +
>>>>>> +		vcpu->arch.st.this_time_out = get_kernel_ns();
>>>>>> +		vcpu->arch.st.stime = data&    KVM_STEAL_VALID_BITS;
>>>>>> +		record_steal_time(vcpu);
>>>>>> +
>>>>>> +		break;
>>>>>> +
>>>>>>   	case MSR_IA32_MCG_CTL:
>>>>>>   	case MSR_IA32_MCG_STATUS:
>>>>>>   	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
>>>>>> @@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>>>>>>   	case MSR_KVM_ASYNC_PF_EN:
>>>>>>   		data = vcpu->arch.apf.msr_val;
>>>>>>   		break;
>>>>>> +	case MSR_KVM_STEAL_TIME:
>>>>>> +		data = vcpu->arch.st.msr_val;
>>>>>> +		break;
>>>>>>   	case MSR_IA32_P5_MC_ADDR:
>>>>>>   	case MSR_IA32_P5_MC_TYPE:
>>>>>>   	case MSR_IA32_MCG_CAP:
>>>>>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>>>>   			kvm_migrate_timers(vcpu);
>>>>>>   		vcpu->cpu = cpu;
>>>>>>   	}
>>>>>> +
>>>>>> +	record_steal_time(vcpu);
>>>>>>   }
>>>>>>
>>>>>>   void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>>>> @@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>>>>   	kvm_x86_ops->vcpu_put(vcpu);
>>>>>>   	kvm_put_guest_fpu(vcpu);
>>>>>>   	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
>>>>>> +	vcpu->arch.st.this_time_out = get_kernel_ns();
>>>>>>   }
>>>>>>
>>>>> Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
>>>>> just before/after entering/exiting a guest? vcpu_(put|get) are called
>>>>> for each vcpu ioctl, not only VCPU_RUN.
>>>> Sorry, missed that the first time I've read your e-mail.
>>>>
>>>> If done like you said, time spent on the hypervisor is accounted as
>>>> steal time. I don't think it is.
>>> I thought that this is the point of a steal time. Running other
>>> tasks/guests is a hypervisor overhead too after all :) Also what about
>>> time spend serving host interrupts between put/get? It will not be
>>> accounted as steal time, correct?
>>
>> This is mostly semantics. I like to compare this to a normal
>> process: There is a difference between time the OS spent on your
>> behalf, doing your system calls (sys), and time spent by other
>> processes. Similar thing here.
>>
> The problem with this approach is that things like doing "info cpus"
> in qemu monitor will change guest scheduling behaviour. Do we want it
> to be like that?

You mean because it is running in a different thread, and will compete 
for resources with the cpu thread ?

>> Which put/get are you referring to specifically ? You mean
>> vcpu_put() vs vcpu_load() ?
>>
> Yes.
>
>> If they are after vcpu_put(), they will, because at this time your
>> process is officially out of the cpu.
>>
> And if they are between vcpu_load() and vcpu_put() they will be accounted as
> a work done on behalf of a guest although they are likely unrelated.
I think the best we can do around it here is record steal time / measure 
time as late as we can. We could in theory subtract irq time, but it 
sounds too complicated for little gain.

>>
>>>>
>>>> Steal time is time spent running someone else's job instead of
>>>> yours. The name for the time spent in the hypervisor doing something
>>>> for *you* is just overhead.
>>> OK. That is the question of a definition I guess. If you define it like
>>> that the code is correct.
>>>
>>>>
>>>>>
>>>>>>   static int is_efer_nx(void)
>>>>>> @@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>>>>>   			     (1<<    KVM_FEATURE_NOP_IO_DELAY) |
>>>>>>   			     (1<<    KVM_FEATURE_CLOCKSOURCE2) |
>>>>>>   			     (1<<    KVM_FEATURE_ASYNC_PF) |
>>>>>> -			     (1<<    KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
>>>>>> +			     (1<<    KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
>>>>>> +			     (1<<    KVM_FEATURE_STEAL_TIME);
>>>>>>   		entry->ebx = 0;
>>>>>>   		entry->ecx = 0;
>>>>>>   		entry->edx = 0;
>>>>>> @@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
>>>>>>
>>>>>>   	kvmclock_reset(vcpu);
>>>>>>
>>>>>> +	vcpu->arch.st.stime = 0;
>>>>>> +
>>>>>>   	kvm_clear_async_pf_completion_queue(vcpu);
>>>>>>   	kvm_async_pf_hash_reset(vcpu);
>>>>>>   	vcpu->arch.apf.halted = false;
>>>>>> --
>>>>>> 1.7.3.4
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>
>>>>> --
>>>>> 			Gleb.
>>>
>>> --
>>> 			Gleb.
>
> --
> 			Gleb.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov June 16, 2011, 12:21 p.m. UTC | #10
On Thu, Jun 16, 2011 at 09:11:42AM -0300, Glauber Costa wrote:
> On 06/16/2011 08:27 AM, Gleb Natapov wrote:
> >On Thu, Jun 16, 2011 at 12:31:02AM -0300, Glauber Costa wrote:
> >>On 06/15/2011 06:09 AM, Gleb Natapov wrote:
> >>>On Wed, Jun 15, 2011 at 12:09:31AM -0300, Glauber Costa wrote:
> >>>>On 06/14/2011 04:45 AM, Gleb Natapov wrote:
> >>>>>On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
> >>>>>>To implement steal time, we need the hypervisor to pass the guest information
> >>>>>>about how much time was spent running other processes outside the VM.
> >>>>>>This is per-vcpu, and using the kvmclock structure for that is an abuse
> >>>>>>we decided not to make.
> >>>>>>
> >>>>>>In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
> >>>>>>holds the memory area address containing information about steal time
> >>>>>>
> >>>>>>This patch contains the hypervisor part for it. I am keeping it separate from
> >>>>>>the headers to facilitate backports to people who wants to backport the kernel
> >>>>>>part but not the hypervisor, or the other way around.
> >>>>>>
> >>>>>>Signed-off-by: Glauber Costa<glommer@redhat.com>
> >>>>>>CC: Rik van Riel<riel@redhat.com>
> >>>>>>CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
> >>>>>>CC: Peter Zijlstra<peterz@infradead.org>
> >>>>>>CC: Avi Kivity<avi@redhat.com>
> >>>>>>CC: Anthony Liguori<aliguori@us.ibm.com>
> >>>>>>CC: Eric B Munson<emunson@mgebm.net>
> >>>>>>---
> >>>>>>  arch/x86/include/asm/kvm_host.h |    8 +++++
> >>>>>>  arch/x86/include/asm/kvm_para.h |    4 ++
> >>>>>>  arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
> >>>>>>  3 files changed, 69 insertions(+), 3 deletions(-)
> >>>>>>
> >>>>>>diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >>>>>>index fc38eca..5dce014 100644
> >>>>>>--- a/arch/x86/include/asm/kvm_host.h
> >>>>>>+++ b/arch/x86/include/asm/kvm_host.h
> >>>>>>@@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
> >>>>>>  	unsigned int hw_tsc_khz;
> >>>>>>  	unsigned int time_offset;
> >>>>>>  	struct page *time_page;
> >>>>>>+
> >>>>>>+	struct {
> >>>>>>+		u64 msr_val;
> >>>>>>+		gpa_t stime;
> >>>>>>+		struct kvm_steal_time steal;
> >>>>>>+		u64 this_time_out;
> >>>>>>+	} st;
> >>>>>>+
> >>>>>>  	u64 last_guest_tsc;
> >>>>>>  	u64 last_kernel_ns;
> >>>>>>  	u64 last_tsc_nsec;
> >>>>>>diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> >>>>>>index ac306c4..0341e61 100644
> >>>>>>--- a/arch/x86/include/asm/kvm_para.h
> >>>>>>+++ b/arch/x86/include/asm/kvm_para.h
> >>>>>>@@ -45,6 +45,10 @@ struct kvm_steal_time {
> >>>>>>  	__u32 pad[6];
> >>>>>>  };
> >>>>>>
> >>>>>>+#define KVM_STEAL_ALIGNMENT_BITS 5
> >>>>>>+#define KVM_STEAL_VALID_BITS ((-1ULL<<    (KVM_STEAL_ALIGNMENT_BITS + 1)))
> >>>>>>+#define KVM_STEAL_RESERVED_MASK (((1<<    KVM_STEAL_ALIGNMENT_BITS) - 1 )<<    1)
> >>>>>>+
> >>>>>>  #define KVM_MAX_MMU_OP_BATCH           32
> >>>>>>
> >>>>>>  #define KVM_ASYNC_PF_ENABLED			(1<<    0)
> >>>>>>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>>>>>index 6645634..10fe028 100644
> >>>>>>--- a/arch/x86/kvm/x86.c
> >>>>>>+++ b/arch/x86/kvm/x86.c
> >>>>>>@@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
> >>>>>>   * kvm-specific. Those are put in the beginning of the list.
> >>>>>>   */
> >>>>>>
> >>>>>>-#define KVM_SAVE_MSRS_BEGIN	8
> >>>>>>+#define KVM_SAVE_MSRS_BEGIN	9
> >>>>>>  static u32 msrs_to_save[] = {
> >>>>>>  	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
> >>>>>>  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
> >>>>>>  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
> >>>>>>-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
> >>>>>>+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
> >>>>>>  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
> >>>>>>  	MSR_STAR,
> >>>>>>  #ifdef CONFIG_X86_64
> >>>>>>@@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
> >>>>>>  	}
> >>>>>>  }
> >>>>>>
> >>>>>>+static void record_steal_time(struct kvm_vcpu *vcpu)
> >>>>>>+{
> >>>>>>+	u64 delta;
> >>>>>>+
> >>>>>>+	if (vcpu->arch.st.stime&&    vcpu->arch.st.this_time_out) {
> >>>>>>+
> >>>>>>+		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
> >>>>>>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>>>>>+
> >>>>>>+			vcpu->arch.st.stime = 0;
> >>>>>>+			return;
> >>>>>>+		}
> >>>>>>+
> >>>>>>+		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
> >>>>>>+
> >>>>>>+		vcpu->arch.st.steal.steal += delta;
> >>>>>>+		vcpu->arch.st.steal.version += 2;
> >>>>>>+
> >>>>>>+		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
> >>>>>Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
> >>>>>for the read above?
> >>>>
> >>>>Actually, I'd expect most read/writes to benefit from caching, no?
> >>>>So why don't we just rename kvm_write_guest_cached() to
> >>>>kvm_write_guest(), and the few places - if any - that need to force
> >>>>transversing of the gfn mappings, get renamed to
> >>>>kvm_write_guest_uncached ?
> >>>>
> >>>Good idea. I do not see any places where kvm_write_guest_uncached is
> >>>needed from a brief look. Avi?
> >>>
> >>>>>>+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
> >>>>>>+
> >>>>>>+			vcpu->arch.st.stime = 0;
> >>>>>>+			return;
> >>>>>>+		}
> >>>>>>+	}
> >>>>>>+
> >>>>>>+}
> >>>>>>+
> >>>>>>  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >>>>>>  {
> >>>>>>  	switch (msr) {
> >>>>>>@@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >>>>>>  		if (kvm_pv_enable_async_pf(vcpu, data))
> >>>>>>  			return 1;
> >>>>>>  		break;
> >>>>>>+	case MSR_KVM_STEAL_TIME:
> >>>>>>+		vcpu->arch.st.msr_val = data;
> >>>>>>+
> >>>>>>+		if (!(data&    KVM_MSR_ENABLED)) {
> >>>>>>+			vcpu->arch.st.stime = 0;
> >>>>>>+			break;
> >>>>>>+		}
> >>>>>>+
> >>>>>>+		if (data&    KVM_STEAL_RESERVED_MASK)
> >>>>>>+			return 1;
> >>>>>>+
> >>>>>>+		vcpu->arch.st.this_time_out = get_kernel_ns();
> >>>>>>+		vcpu->arch.st.stime = data&    KVM_STEAL_VALID_BITS;
> >>>>>>+		record_steal_time(vcpu);
> >>>>>>+
> >>>>>>+		break;
> >>>>>>+
> >>>>>>  	case MSR_IA32_MCG_CTL:
> >>>>>>  	case MSR_IA32_MCG_STATUS:
> >>>>>>  	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
> >>>>>>@@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> >>>>>>  	case MSR_KVM_ASYNC_PF_EN:
> >>>>>>  		data = vcpu->arch.apf.msr_val;
> >>>>>>  		break;
> >>>>>>+	case MSR_KVM_STEAL_TIME:
> >>>>>>+		data = vcpu->arch.st.msr_val;
> >>>>>>+		break;
> >>>>>>  	case MSR_IA32_P5_MC_ADDR:
> >>>>>>  	case MSR_IA32_P5_MC_TYPE:
> >>>>>>  	case MSR_IA32_MCG_CAP:
> >>>>>>@@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>>>>>  			kvm_migrate_timers(vcpu);
> >>>>>>  		vcpu->cpu = cpu;
> >>>>>>  	}
> >>>>>>+
> >>>>>>+	record_steal_time(vcpu);
> >>>>>>  }
> >>>>>>
> >>>>>>  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>>>>>@@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>>>>>  	kvm_x86_ops->vcpu_put(vcpu);
> >>>>>>  	kvm_put_guest_fpu(vcpu);
> >>>>>>  	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
> >>>>>>+	vcpu->arch.st.this_time_out = get_kernel_ns();
> >>>>>>  }
> >>>>>>
> >>>>>Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
> >>>>>just before/after entering/exiting a guest? vcpu_(put|get) are called
> >>>>>for each vcpu ioctl, not only VCPU_RUN.
> >>>>Sorry, missed that the first time I've read your e-mail.
> >>>>
> >>>>If done like you said, time spent on the hypervisor is accounted as
> >>>>steal time. I don't think it is.
> >>>I thought that this is the point of a steal time. Running other
> >>>tasks/guests is a hypervisor overhead too after all :) Also what about
> >>>time spend serving host interrupts between put/get? It will not be
> >>>accounted as steal time, correct?
> >>
> >>This is mostly semantics. I like to compare this to a normal
> >>process: There is a difference between time the OS spent on your
> >>behalf, doing your system calls (sys), and time spent by other
> >>processes. Similar thing here.
> >>
> >The problem with this approach is that things like doing "info cpus"
> >in qemu monitor will change guest scheduling behaviour. Do we want it
> >to be like that?
> 
> You mean because it is running in a different thread, and will
> compete for resources with the cpu thread ?
> 
No, because it executes GET_REGS ioctl (in vcpu thread). The time
it takes to execute it is accounted as time the hypervior spent on behave
of a guest. Not a big deal if it is executed rarely. I tend to use "info
cpus"/"info register" quite a lot when debugging and would preffer it to
not affect guest behaviour.


> >>Which put/get are you referring to specifically ? You mean
> >>vcpu_put() vs vcpu_load() ?
> >>
> >Yes.
> >
> >>If they are after vcpu_put(), they will, because at this time your
> >>process is officially out of the cpu.
> >>
> >And if they are between vcpu_load() and vcpu_put() they will be accounted as
> >a work done on behalf of a guest although they are likely unrelated.
> I think the best we can do around it here is record steal time /
> measure time as late as we can. We could in theory subtract irq
> time, but it sounds too complicated for little gain.
> 
Recording steal time/measure time as close as possible to vmentry/vmexit
is what I propose :) I agree about little gain part.

> >>
> >>>>
> >>>>Steal time is time spent running someone else's job instead of
> >>>>yours. The name for the time spent in the hypervisor doing something
> >>>>for *you* is just overhead.
> >>>OK. That is the question of a definition I guess. If you define it like
> >>>that the code is correct.
> >>>
> >>>>
> >>>>>
> >>>>>>  static int is_efer_nx(void)
> >>>>>>@@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> >>>>>>  			     (1<<    KVM_FEATURE_NOP_IO_DELAY) |
> >>>>>>  			     (1<<    KVM_FEATURE_CLOCKSOURCE2) |
> >>>>>>  			     (1<<    KVM_FEATURE_ASYNC_PF) |
> >>>>>>-			     (1<<    KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
> >>>>>>+			     (1<<    KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> >>>>>>+			     (1<<    KVM_FEATURE_STEAL_TIME);
> >>>>>>  		entry->ebx = 0;
> >>>>>>  		entry->ecx = 0;
> >>>>>>  		entry->edx = 0;
> >>>>>>@@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
> >>>>>>
> >>>>>>  	kvmclock_reset(vcpu);
> >>>>>>
> >>>>>>+	vcpu->arch.st.stime = 0;
> >>>>>>+
> >>>>>>  	kvm_clear_async_pf_completion_queue(vcpu);
> >>>>>>  	kvm_async_pf_hash_reset(vcpu);
> >>>>>>  	vcpu->arch.apf.halted = false;
> >>>>>>--
> >>>>>>1.7.3.4
> >>>>>>
> >>>>>>--
> >>>>>>To unsubscribe from this list: send the line "unsubscribe kvm" in
> >>>>>>the body of a message to majordomo@vger.kernel.org
> >>>>>>More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>>>>
> >>>>>--
> >>>>>			Gleb.
> >>>
> >>>--
> >>>			Gleb.
> >
> >--
> >			Gleb.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa June 16, 2011, 12:24 p.m. UTC | #11
On 06/16/2011 09:21 AM, Gleb Natapov wrote:
> On Thu, Jun 16, 2011 at 09:11:42AM -0300, Glauber Costa wrote:
>> On 06/16/2011 08:27 AM, Gleb Natapov wrote:
>>> On Thu, Jun 16, 2011 at 12:31:02AM -0300, Glauber Costa wrote:
>>>> On 06/15/2011 06:09 AM, Gleb Natapov wrote:
>>>>> On Wed, Jun 15, 2011 at 12:09:31AM -0300, Glauber Costa wrote:
>>>>>> On 06/14/2011 04:45 AM, Gleb Natapov wrote:
>>>>>>> On Mon, Jun 13, 2011 at 07:31:33PM -0400, Glauber Costa wrote:
>>>>>>>> To implement steal time, we need the hypervisor to pass the guest information
>>>>>>>> about how much time was spent running other processes outside the VM.
>>>>>>>> This is per-vcpu, and using the kvmclock structure for that is an abuse
>>>>>>>> we decided not to make.
>>>>>>>>
>>>>>>>> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
>>>>>>>> holds the memory area address containing information about steal time
>>>>>>>>
>>>>>>>> This patch contains the hypervisor part for it. I am keeping it separate from
>>>>>>>> the headers to facilitate backports to people who wants to backport the kernel
>>>>>>>> part but not the hypervisor, or the other way around.
>>>>>>>>
>>>>>>>> Signed-off-by: Glauber Costa<glommer@redhat.com>
>>>>>>>> CC: Rik van Riel<riel@redhat.com>
>>>>>>>> CC: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
>>>>>>>> CC: Peter Zijlstra<peterz@infradead.org>
>>>>>>>> CC: Avi Kivity<avi@redhat.com>
>>>>>>>> CC: Anthony Liguori<aliguori@us.ibm.com>
>>>>>>>> CC: Eric B Munson<emunson@mgebm.net>
>>>>>>>> ---
>>>>>>>>   arch/x86/include/asm/kvm_host.h |    8 +++++
>>>>>>>>   arch/x86/include/asm/kvm_para.h |    4 ++
>>>>>>>>   arch/x86/kvm/x86.c              |   60 +++++++++++++++++++++++++++++++++++++--
>>>>>>>>   3 files changed, 69 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>>>>>> index fc38eca..5dce014 100644
>>>>>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>>>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>>>>>> @@ -388,6 +388,14 @@ struct kvm_vcpu_arch {
>>>>>>>>   	unsigned int hw_tsc_khz;
>>>>>>>>   	unsigned int time_offset;
>>>>>>>>   	struct page *time_page;
>>>>>>>> +
>>>>>>>> +	struct {
>>>>>>>> +		u64 msr_val;
>>>>>>>> +		gpa_t stime;
>>>>>>>> +		struct kvm_steal_time steal;
>>>>>>>> +		u64 this_time_out;
>>>>>>>> +	} st;
>>>>>>>> +
>>>>>>>>   	u64 last_guest_tsc;
>>>>>>>>   	u64 last_kernel_ns;
>>>>>>>>   	u64 last_tsc_nsec;
>>>>>>>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>>>>>>>> index ac306c4..0341e61 100644
>>>>>>>> --- a/arch/x86/include/asm/kvm_para.h
>>>>>>>> +++ b/arch/x86/include/asm/kvm_para.h
>>>>>>>> @@ -45,6 +45,10 @@ struct kvm_steal_time {
>>>>>>>>   	__u32 pad[6];
>>>>>>>>   };
>>>>>>>>
>>>>>>>> +#define KVM_STEAL_ALIGNMENT_BITS 5
>>>>>>>> +#define KVM_STEAL_VALID_BITS ((-1ULL<<     (KVM_STEAL_ALIGNMENT_BITS + 1)))
>>>>>>>> +#define KVM_STEAL_RESERVED_MASK (((1<<     KVM_STEAL_ALIGNMENT_BITS) - 1 )<<     1)
>>>>>>>> +
>>>>>>>>   #define KVM_MAX_MMU_OP_BATCH           32
>>>>>>>>
>>>>>>>>   #define KVM_ASYNC_PF_ENABLED			(1<<     0)
>>>>>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>>>>>> index 6645634..10fe028 100644
>>>>>>>> --- a/arch/x86/kvm/x86.c
>>>>>>>> +++ b/arch/x86/kvm/x86.c
>>>>>>>> @@ -797,12 +797,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
>>>>>>>>    * kvm-specific. Those are put in the beginning of the list.
>>>>>>>>    */
>>>>>>>>
>>>>>>>> -#define KVM_SAVE_MSRS_BEGIN	8
>>>>>>>> +#define KVM_SAVE_MSRS_BEGIN	9
>>>>>>>>   static u32 msrs_to_save[] = {
>>>>>>>>   	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
>>>>>>>>   	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
>>>>>>>>   	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
>>>>>>>> -	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
>>>>>>>> +	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
>>>>>>>>   	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
>>>>>>>>   	MSR_STAR,
>>>>>>>>   #ifdef CONFIG_X86_64
>>>>>>>> @@ -1480,6 +1480,34 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
>>>>>>>>   	}
>>>>>>>>   }
>>>>>>>>
>>>>>>>> +static void record_steal_time(struct kvm_vcpu *vcpu)
>>>>>>>> +{
>>>>>>>> +	u64 delta;
>>>>>>>> +
>>>>>>>> +	if (vcpu->arch.st.stime&&     vcpu->arch.st.this_time_out) {
>>>>>>>> +
>>>>>>>> +		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
>>>>>>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>>>>>>> +
>>>>>>>> +			vcpu->arch.st.stime = 0;
>>>>>>>> +			return;
>>>>>>>> +		}
>>>>>>>> +
>>>>>>>> +		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
>>>>>>>> +
>>>>>>>> +		vcpu->arch.st.steal.steal += delta;
>>>>>>>> +		vcpu->arch.st.steal.version += 2;
>>>>>>>> +
>>>>>>>> +		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
>>>>>>> Why not use kvm_write_guest_cached() here and introduce kvm_read_guest_cached()
>>>>>>> for the read above?
>>>>>>
>>>>>> Actually, I'd expect most read/writes to benefit from caching, no?
>>>>>> So why don't we just rename kvm_write_guest_cached() to
>>>>>> kvm_write_guest(), and the few places - if any - that need to force
>>>>>> transversing of the gfn mappings, get renamed to
>>>>>> kvm_write_guest_uncached ?
>>>>>>
>>>>> Good idea. I do not see any places where kvm_write_guest_uncached is
>>>>> needed from a brief look. Avi?
>>>>>
>>>>>>>> +			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
>>>>>>>> +
>>>>>>>> +			vcpu->arch.st.stime = 0;
>>>>>>>> +			return;
>>>>>>>> +		}
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>   int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>>>>>>>   {
>>>>>>>>   	switch (msr) {
>>>>>>>> @@ -1562,6 +1590,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>>>>>>>>   		if (kvm_pv_enable_async_pf(vcpu, data))
>>>>>>>>   			return 1;
>>>>>>>>   		break;
>>>>>>>> +	case MSR_KVM_STEAL_TIME:
>>>>>>>> +		vcpu->arch.st.msr_val = data;
>>>>>>>> +
>>>>>>>> +		if (!(data&     KVM_MSR_ENABLED)) {
>>>>>>>> +			vcpu->arch.st.stime = 0;
>>>>>>>> +			break;
>>>>>>>> +		}
>>>>>>>> +
>>>>>>>> +		if (data&     KVM_STEAL_RESERVED_MASK)
>>>>>>>> +			return 1;
>>>>>>>> +
>>>>>>>> +		vcpu->arch.st.this_time_out = get_kernel_ns();
>>>>>>>> +		vcpu->arch.st.stime = data&     KVM_STEAL_VALID_BITS;
>>>>>>>> +		record_steal_time(vcpu);
>>>>>>>> +
>>>>>>>> +		break;
>>>>>>>> +
>>>>>>>>   	case MSR_IA32_MCG_CTL:
>>>>>>>>   	case MSR_IA32_MCG_STATUS:
>>>>>>>>   	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
>>>>>>>> @@ -1847,6 +1892,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>>>>>>>>   	case MSR_KVM_ASYNC_PF_EN:
>>>>>>>>   		data = vcpu->arch.apf.msr_val;
>>>>>>>>   		break;
>>>>>>>> +	case MSR_KVM_STEAL_TIME:
>>>>>>>> +		data = vcpu->arch.st.msr_val;
>>>>>>>> +		break;
>>>>>>>>   	case MSR_IA32_P5_MC_ADDR:
>>>>>>>>   	case MSR_IA32_P5_MC_TYPE:
>>>>>>>>   	case MSR_IA32_MCG_CAP:
>>>>>>>> @@ -2158,6 +2206,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>>>>>>   			kvm_migrate_timers(vcpu);
>>>>>>>>   		vcpu->cpu = cpu;
>>>>>>>>   	}
>>>>>>>> +
>>>>>>>> +	record_steal_time(vcpu);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>>>>>> @@ -2165,6 +2215,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>>>>>>>   	kvm_x86_ops->vcpu_put(vcpu);
>>>>>>>>   	kvm_put_guest_fpu(vcpu);
>>>>>>>>   	kvm_get_msr(vcpu, MSR_IA32_TSC,&vcpu->arch.last_guest_tsc);
>>>>>>>> +	vcpu->arch.st.this_time_out = get_kernel_ns();
>>>>>>>>   }
>>>>>>>>
>>>>>>> Shouldn't we call record_steal_time(vcpu)/vcpu->arch.st.this_time_out = get_kernel_ns();
>>>>>>> just before/after entering/exiting a guest? vcpu_(put|get) are called
>>>>>>> for each vcpu ioctl, not only VCPU_RUN.
>>>>>> Sorry, missed that the first time I've read your e-mail.
>>>>>>
>>>>>> If done like you said, time spent on the hypervisor is accounted as
>>>>>> steal time. I don't think it is.
>>>>> I thought that this is the point of a steal time. Running other
>>>>> tasks/guests is a hypervisor overhead too after all :) Also what about
>>>>> time spend serving host interrupts between put/get? It will not be
>>>>> accounted as steal time, correct?
>>>>
>>>> This is mostly semantics. I like to compare this to a normal
>>>> process: There is a difference between time the OS spent on your
>>>> behalf, doing your system calls (sys), and time spent by other
>>>> processes. Similar thing here.
>>>>
>>> The problem with this approach is that things like doing "info cpus"
>>> in qemu monitor will change guest scheduling behaviour. Do we want it
>>> to be like that?
>>
>> You mean because it is running in a different thread, and will
>> compete for resources with the cpu thread ?
>>
> No, because it executes GET_REGS ioctl (in vcpu thread). The time
> it takes to execute it is accounted as time the hypervior spent on behave
> of a guest. Not a big deal if it is executed rarely. I tend to use "info
> cpus"/"info register" quite a lot when debugging and would preffer it to
> not affect guest behaviour.
>
>
>>>> Which put/get are you referring to specifically ? You mean
>>>> vcpu_put() vs vcpu_load() ?
>>>>
>>> Yes.
>>>
>>>> If they are after vcpu_put(), they will, because at this time your
>>>> process is officially out of the cpu.
>>>>
>>> And if they are between vcpu_load() and vcpu_put() they will be accounted as
>>> a work done on behalf of a guest although they are likely unrelated.
>> I think the best we can do around it here is record steal time /
>> measure time as late as we can. We could in theory subtract irq
>> time, but it sounds too complicated for little gain.
>>
> Recording steal time/measure time as close as possible to vmentry/vmexit
> is what I propose :) I agree about little gain part.

Well, I don't like it a priori, due to the reasons I've already stated. 
If there would be a large noticeable gain, there could be a trade off. 
But since you agree with the little gain, I'd prefer to keep it this way.

>>>>
>>>>>>
>>>>>> Steal time is time spent running someone else's job instead of
>>>>>> yours. The name for the time spent in the hypervisor doing something
>>>>>> for *you* is just overhead.
>>>>> OK. That is the question of a definition I guess. If you define it like
>>>>> that the code is correct.
>>>>>
>>>>>>
>>>>>>>
>>>>>>>>   static int is_efer_nx(void)
>>>>>>>> @@ -2477,7 +2528,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>>>>>>>>   			     (1<<     KVM_FEATURE_NOP_IO_DELAY) |
>>>>>>>>   			     (1<<     KVM_FEATURE_CLOCKSOURCE2) |
>>>>>>>>   			     (1<<     KVM_FEATURE_ASYNC_PF) |
>>>>>>>> -			     (1<<     KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
>>>>>>>> +			     (1<<     KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
>>>>>>>> +			     (1<<     KVM_FEATURE_STEAL_TIME);
>>>>>>>>   		entry->ebx = 0;
>>>>>>>>   		entry->ecx = 0;
>>>>>>>>   		entry->edx = 0;
>>>>>>>> @@ -6200,6 +6252,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
>>>>>>>>
>>>>>>>>   	kvmclock_reset(vcpu);
>>>>>>>>
>>>>>>>> +	vcpu->arch.st.stime = 0;
>>>>>>>> +
>>>>>>>>   	kvm_clear_async_pf_completion_queue(vcpu);
>>>>>>>>   	kvm_async_pf_hash_reset(vcpu);
>>>>>>>>   	vcpu->arch.apf.halted = false;
>>>>>>>> --
>>>>>>>> 1.7.3.4
>>>>>>>>
>>>>>>>> --
>>>>>>>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>>
>>>>>>> --
>>>>>>> 			Gleb.
>>>>>
>>>>> --
>>>>> 			Gleb.
>>>
>>> --
>>> 			Gleb.
>
> --
> 			Gleb.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 19, 2011, 12:35 p.m. UTC | #12
On 06/15/2011 12:09 PM, Gleb Natapov wrote:
> >
> >  Actually, I'd expect most read/writes to benefit from caching, no?
> >  So why don't we just rename kvm_write_guest_cached() to
> >  kvm_write_guest(), and the few places - if any - that need to force
> >  transversing of the gfn mappings, get renamed to
> >  kvm_write_guest_uncached ?
> >
> Good idea. I do not see any places where kvm_write_guest_uncached is
> needed from a brief look. Avi?
>

kvm_write_guest_cached() needs something to supply the cache, and needs 
recurring writes to the same location.  Neither of these are common (for 
example, instruction emulation doesn't have either).

> >
> >  If done like you said, time spent on the hypervisor is accounted as
> >  steal time. I don't think it is.
> I thought that this is the point of a steal time. Running other
> tasks/guests is a hypervisor overhead too after all :) Also what about
> time spend serving host interrupts between put/get? It will not be
> accounted as steal time, correct?

With accurate interrupt time accounting, it should be.  Otherwise 
general hypervisor overhead is not steal time.

(i.e. if the host is not overcommitted, steal time should be close to zero).
Gleb Natapov June 19, 2011, 12:59 p.m. UTC | #13
On Sun, Jun 19, 2011 at 03:35:58PM +0300, Avi Kivity wrote:
> On 06/15/2011 12:09 PM, Gleb Natapov wrote:
> >>
> >>  Actually, I'd expect most read/writes to benefit from caching, no?
> >>  So why don't we just rename kvm_write_guest_cached() to
> >>  kvm_write_guest(), and the few places - if any - that need to force
> >>  transversing of the gfn mappings, get renamed to
> >>  kvm_write_guest_uncached ?
> >>
> >Good idea. I do not see any places where kvm_write_guest_uncached is
> >needed from a brief look. Avi?
> >
> 
> kvm_write_guest_cached() needs something to supply the cache, and
> needs recurring writes to the same location.  Neither of these are
> common (for example, instruction emulation doesn't have either).
> 
Correct. Missed that. So what about changing steal time to use
kvm_write_guest_cached()?

> >>
> >>  If done like you said, time spent on the hypervisor is accounted as
> >>  steal time. I don't think it is.
> >I thought that this is the point of a steal time. Running other
> >tasks/guests is a hypervisor overhead too after all :) Also what about
> >time spend serving host interrupts between put/get? It will not be
> >accounted as steal time, correct?
> 
> With accurate interrupt time accounting, it should be.  Otherwise
> general hypervisor overhead is not steal time.
> 
> (i.e. if the host is not overcommitted, steal time should be close to zero).
> 

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 19, 2011, 1:02 p.m. UTC | #14
On 06/19/2011 03:59 PM, Gleb Natapov wrote:
> On Sun, Jun 19, 2011 at 03:35:58PM +0300, Avi Kivity wrote:
> >  On 06/15/2011 12:09 PM, Gleb Natapov wrote:
> >  >>
> >  >>   Actually, I'd expect most read/writes to benefit from caching, no?
> >  >>   So why don't we just rename kvm_write_guest_cached() to
> >  >>   kvm_write_guest(), and the few places - if any - that need to force
> >  >>   transversing of the gfn mappings, get renamed to
> >  >>   kvm_write_guest_uncached ?
> >  >>
> >  >Good idea. I do not see any places where kvm_write_guest_uncached is
> >  >needed from a brief look. Avi?
> >  >
> >
> >  kvm_write_guest_cached() needs something to supply the cache, and
> >  needs recurring writes to the same location.  Neither of these are
> >  common (for example, instruction emulation doesn't have either).
> >
> Correct. Missed that. So what about changing steal time to use
> kvm_write_guest_cached()?

Makes sense, definitely.  Want to post read_guest_cached() as well?
Gleb Natapov June 20, 2011, 7:21 a.m. UTC | #15
On Sun, Jun 19, 2011 at 04:02:22PM +0300, Avi Kivity wrote:
> On 06/19/2011 03:59 PM, Gleb Natapov wrote:
> >On Sun, Jun 19, 2011 at 03:35:58PM +0300, Avi Kivity wrote:
> >>  On 06/15/2011 12:09 PM, Gleb Natapov wrote:
> >>  >>
> >>  >>   Actually, I'd expect most read/writes to benefit from caching, no?
> >>  >>   So why don't we just rename kvm_write_guest_cached() to
> >>  >>   kvm_write_guest(), and the few places - if any - that need to force
> >>  >>   transversing of the gfn mappings, get renamed to
> >>  >>   kvm_write_guest_uncached ?
> >>  >>
> >>  >Good idea. I do not see any places where kvm_write_guest_uncached is
> >>  >needed from a brief look. Avi?
> >>  >
> >>
> >>  kvm_write_guest_cached() needs something to supply the cache, and
> >>  needs recurring writes to the same location.  Neither of these are
> >>  common (for example, instruction emulation doesn't have either).
> >>
> >Correct. Missed that. So what about changing steal time to use
> >kvm_write_guest_cached()?
> 
> Makes sense, definitely.  Want to post read_guest_cached() as well?
> 
Glauber can you write read_guest_cached() as part of your series (should
be trivial), or do you want me to do it? I do not have a code to test it
with though :)

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 20, 2011, 8:02 a.m. UTC | #16
On 06/20/2011 10:21 AM, Gleb Natapov wrote:
> On Sun, Jun 19, 2011 at 04:02:22PM +0300, Avi Kivity wrote:
> >  On 06/19/2011 03:59 PM, Gleb Natapov wrote:
> >  >On Sun, Jun 19, 2011 at 03:35:58PM +0300, Avi Kivity wrote:
> >  >>   On 06/15/2011 12:09 PM, Gleb Natapov wrote:
> >  >>   >>
> >  >>   >>    Actually, I'd expect most read/writes to benefit from caching, no?
> >  >>   >>    So why don't we just rename kvm_write_guest_cached() to
> >  >>   >>    kvm_write_guest(), and the few places - if any - that need to force
> >  >>   >>    transversing of the gfn mappings, get renamed to
> >  >>   >>    kvm_write_guest_uncached ?
> >  >>   >>
> >  >>   >Good idea. I do not see any places where kvm_write_guest_uncached is
> >  >>   >needed from a brief look. Avi?
> >  >>   >
> >  >>
> >  >>   kvm_write_guest_cached() needs something to supply the cache, and
> >  >>   needs recurring writes to the same location.  Neither of these are
> >  >>   common (for example, instruction emulation doesn't have either).
> >  >>
> >  >Correct. Missed that. So what about changing steal time to use
> >  >kvm_write_guest_cached()?
> >
> >  Makes sense, definitely.  Want to post read_guest_cached() as well?
> >
> Glauber can you write read_guest_cached() as part of your series (should
> be trivial), or do you want me to do it? I do not have a code to test it
> with though :)

Yes.

(you can write it, and Glauber can include it in the series)
Glauber Costa June 20, 2011, 12:42 p.m. UTC | #17
On 06/20/2011 05:02 AM, Avi Kivity wrote:
> On 06/20/2011 10:21 AM, Gleb Natapov wrote:
>> On Sun, Jun 19, 2011 at 04:02:22PM +0300, Avi Kivity wrote:
>> > On 06/19/2011 03:59 PM, Gleb Natapov wrote:
>> > >On Sun, Jun 19, 2011 at 03:35:58PM +0300, Avi Kivity wrote:
>> > >> On 06/15/2011 12:09 PM, Gleb Natapov wrote:
>> > >> >>
>> > >> >> Actually, I'd expect most read/writes to benefit from caching,
>> no?
>> > >> >> So why don't we just rename kvm_write_guest_cached() to
>> > >> >> kvm_write_guest(), and the few places - if any - that need to
>> force
>> > >> >> transversing of the gfn mappings, get renamed to
>> > >> >> kvm_write_guest_uncached ?
>> > >> >>
>> > >> >Good idea. I do not see any places where
>> kvm_write_guest_uncached is
>> > >> >needed from a brief look. Avi?
>> > >> >
>> > >>
>> > >> kvm_write_guest_cached() needs something to supply the cache, and
>> > >> needs recurring writes to the same location. Neither of these are
>> > >> common (for example, instruction emulation doesn't have either).
>> > >>
>> > >Correct. Missed that. So what about changing steal time to use
>> > >kvm_write_guest_cached()?
>> >
>> > Makes sense, definitely. Want to post read_guest_cached() as well?
>> >
>> Glauber can you write read_guest_cached() as part of your series (should
>> be trivial), or do you want me to do it? I do not have a code to test it
>> with though :)
>
> Yes.
>
> (you can write it, and Glauber can include it in the series)
>
Write it, handle me the patch, I'll include it and test it.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc38eca..5dce014 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -388,6 +388,14 @@  struct kvm_vcpu_arch {
 	unsigned int hw_tsc_khz;
 	unsigned int time_offset;
 	struct page *time_page;
+
+	struct {
+		u64 msr_val;
+		gpa_t stime;
+		struct kvm_steal_time steal;
+		u64 this_time_out;
+	} st;
+
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
 	u64 last_tsc_nsec;
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ac306c4..0341e61 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -45,6 +45,10 @@  struct kvm_steal_time {
 	__u32 pad[6];
 };
 
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
+
 #define KVM_MAX_MMU_OP_BATCH           32
 
 #define KVM_ASYNC_PF_ENABLED			(1 << 0)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6645634..10fe028 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -797,12 +797,12 @@  EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	8
+#define KVM_SAVE_MSRS_BEGIN	9
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1480,6 +1480,34 @@  static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+	u64 delta;
+
+	if (vcpu->arch.st.stime && vcpu->arch.st.this_time_out) {
+
+		if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime,
+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
+
+			vcpu->arch.st.stime = 0;
+			return;
+		}
+
+		delta = (get_kernel_ns() - vcpu->arch.st.this_time_out);
+
+		vcpu->arch.st.steal.steal += delta;
+		vcpu->arch.st.steal.version += 2;
+
+		if (unlikely(kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime,
+			&vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) {
+
+			vcpu->arch.st.stime = 0;
+			return;
+		}
+	}
+
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
@@ -1562,6 +1590,23 @@  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (kvm_pv_enable_async_pf(vcpu, data))
 			return 1;
 		break;
+	case MSR_KVM_STEAL_TIME:
+		vcpu->arch.st.msr_val = data;
+
+		if (!(data & KVM_MSR_ENABLED)) {
+			vcpu->arch.st.stime = 0;
+			break;
+		}
+
+		if (data & KVM_STEAL_RESERVED_MASK)
+			return 1;
+
+		vcpu->arch.st.this_time_out = get_kernel_ns();
+		vcpu->arch.st.stime = data & KVM_STEAL_VALID_BITS;
+		record_steal_time(vcpu);
+
+		break;
+
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1847,6 +1892,9 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_ASYNC_PF_EN:
 		data = vcpu->arch.apf.msr_val;
 		break;
+	case MSR_KVM_STEAL_TIME:
+		data = vcpu->arch.st.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -2158,6 +2206,8 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
 	}
+
+	record_steal_time(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2165,6 +2215,7 @@  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
 	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	vcpu->arch.st.this_time_out = get_kernel_ns();
 }
 
 static int is_efer_nx(void)
@@ -2477,7 +2528,8 @@  static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+			     (1 << KVM_FEATURE_STEAL_TIME);
 		entry->ebx = 0;
 		entry->ecx = 0;
 		entry->edx = 0;
@@ -6200,6 +6252,8 @@  int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvmclock_reset(vcpu);
 
+	vcpu->arch.st.stime = 0;
+
 	kvm_clear_async_pf_completion_queue(vcpu);
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;