
KVM: Use thread debug register storage instead of kvm specific data

Message ID 1251798248-13164-1-git-send-email-avi@redhat.com (mailing list archive)
State New, archived

Commit Message

Avi Kivity Sept. 1, 2009, 9:44 a.m. UTC
Instead of saving the debug registers from the processor to a kvm data
structure, rely on the debug registers stored in the thread structure.
This allows us not to save dr6 and dr7.

Reduces lightweight vmexit cost by 350 cycles, or 11 percent.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |    3 ---
 arch/x86/kvm/x86.c              |   22 +++++++---------------
 2 files changed, 7 insertions(+), 18 deletions(-)

Comments

Avi Kivity Sept. 1, 2009, 9:47 a.m. UTC | #1
On 09/01/2009 12:44 PM, Avi Kivity wrote:
> Instead of saving the debug registers from the processor to a kvm data
> structure, rely in the debug registers stored in the thread structure.
> This allows us not to save dr6 and dr7.
>
> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
>    

Andrew, this is now available as the 'debugreg' branch of kvm.git.  
Given the massive performance improvement, it will be interesting to see 
how the test results change.

Marcelo, please queue this for 2.6.32, and I think it's even suitable 
for -stable.
Jan Kiszka Sept. 1, 2009, 10:42 a.m. UTC | #2
Avi Kivity wrote:
> Instead of saving the debug registers from the processor to a kvm data
> structure, rely in the debug registers stored in the thread structure.
> This allows us not to save dr6 and dr7.
> 
> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    3 ---
>  arch/x86/kvm/x86.c              |   22 +++++++---------------
>  2 files changed, 7 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6046e6f..45226f0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -362,9 +362,6 @@ struct kvm_vcpu_arch {
>  	u32 pat;
>  
>  	int switch_db_regs;
> -	unsigned long host_db[KVM_NR_DB_REGS];
> -	unsigned long host_dr6;
> -	unsigned long host_dr7;
>  	unsigned long db[KVM_NR_DB_REGS];
>  	unsigned long dr6;
>  	unsigned long dr7;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 891234b..9e3acbd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3627,14 +3627,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	kvm_guest_enter();
>  
> -	get_debugreg(vcpu->arch.host_dr6, 6);
> -	get_debugreg(vcpu->arch.host_dr7, 7);
>  	if (unlikely(vcpu->arch.switch_db_regs)) {
> -		get_debugreg(vcpu->arch.host_db[0], 0);
> -		get_debugreg(vcpu->arch.host_db[1], 1);
> -		get_debugreg(vcpu->arch.host_db[2], 2);
> -		get_debugreg(vcpu->arch.host_db[3], 3);
> -

I'm fine with this for the common case, but we should switch to the safe
pattern on test_thread_flag(TIF_DEBUG), i.e. if someone debugs qemu using
hardware break/watchpoints.
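
Something along these lines, i.e. keep the old save path, but only when
the task is actually being debugged (just a sketch - it would mean
keeping the host_db/host_dr6/host_dr7 fields this patch removes):

	if (unlikely(test_thread_flag(TIF_DEBUG))) {
		/* qemu itself runs under a hw-breakpoint debugger,
		 * so play it safe and save the host state */
		get_debugreg(vcpu->arch.host_db[0], 0);
		get_debugreg(vcpu->arch.host_db[1], 1);
		get_debugreg(vcpu->arch.host_db[2], 2);
		get_debugreg(vcpu->arch.host_db[3], 3);
		get_debugreg(vcpu->arch.host_dr6, 6);
		get_debugreg(vcpu->arch.host_dr7, 7);
	}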

>  		set_debugreg(0, 7);
>  		set_debugreg(vcpu->arch.eff_db[0], 0);
>  		set_debugreg(vcpu->arch.eff_db[1], 1);
> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	trace_kvm_entry(vcpu->vcpu_id);
>  	kvm_x86_ops->run(vcpu);
>  
> -	if (unlikely(vcpu->arch.switch_db_regs)) {
> -		set_debugreg(0, 7);
> -		set_debugreg(vcpu->arch.host_db[0], 0);
> -		set_debugreg(vcpu->arch.host_db[1], 1);
> -		set_debugreg(vcpu->arch.host_db[2], 2);
> -		set_debugreg(vcpu->arch.host_db[3], 3);
> +	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {

IIRC, you must clear dr7 before loading dr0..3 to avoid spurious effects.

> +		set_debugreg(current->thread.debugreg0, 0);
> +		set_debugreg(current->thread.debugreg1, 1);
> +		set_debugreg(current->thread.debugreg2, 2);
> +		set_debugreg(current->thread.debugreg3, 3);
> +		set_debugreg(current->thread.debugreg6, 6);
> +		set_debugreg(current->thread.debugreg7, 7);
>  	}
> -	set_debugreg(vcpu->arch.host_dr6, 6);
> -	set_debugreg(vcpu->arch.host_dr7, 7);

Unless I miss something ATM, moving this into the if-clause should cause
trouble if the guest leaves dr7 armed behind. But I need to refresh my
memory on this.

>  
>  	set_bit(KVM_REQ_KICK, &vcpu->requests);
>  	local_irq_enable();

Jan
Jan Kiszka Sept. 1, 2009, 11:08 a.m. UTC | #3
Jan Kiszka wrote:
> Avi Kivity wrote:
>> Instead of saving the debug registers from the processor to a kvm data
>> structure, rely in the debug registers stored in the thread structure.
>> This allows us not to save dr6 and dr7.
>>
>> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
>>
>> Signed-off-by: Avi Kivity <avi@redhat.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    3 ---
>>  arch/x86/kvm/x86.c              |   22 +++++++---------------
>>  2 files changed, 7 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 6046e6f..45226f0 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -362,9 +362,6 @@ struct kvm_vcpu_arch {
>>  	u32 pat;
>>  
>>  	int switch_db_regs;
>> -	unsigned long host_db[KVM_NR_DB_REGS];
>> -	unsigned long host_dr6;
>> -	unsigned long host_dr7;
>>  	unsigned long db[KVM_NR_DB_REGS];
>>  	unsigned long dr6;
>>  	unsigned long dr7;
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 891234b..9e3acbd 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -3627,14 +3627,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>  
>>  	kvm_guest_enter();
>>  
>> -	get_debugreg(vcpu->arch.host_dr6, 6);
>> -	get_debugreg(vcpu->arch.host_dr7, 7);
>>  	if (unlikely(vcpu->arch.switch_db_regs)) {
>> -		get_debugreg(vcpu->arch.host_db[0], 0);
>> -		get_debugreg(vcpu->arch.host_db[1], 1);
>> -		get_debugreg(vcpu->arch.host_db[2], 2);
>> -		get_debugreg(vcpu->arch.host_db[3], 3);
>> -
> 
> I'm fine with this for the common case, but we should switch to the safe
> pattern on test_thread_flag(TIF_DEBUG), ie. if someone debugs qemu using
> hardware break/watchpoints.
> 
>>  		set_debugreg(0, 7);
>>  		set_debugreg(vcpu->arch.eff_db[0], 0);
>>  		set_debugreg(vcpu->arch.eff_db[1], 1);
>> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>  	trace_kvm_entry(vcpu->vcpu_id);
>>  	kvm_x86_ops->run(vcpu);
>>  
>> -	if (unlikely(vcpu->arch.switch_db_regs)) {
>> -		set_debugreg(0, 7);
>> -		set_debugreg(vcpu->arch.host_db[0], 0);
>> -		set_debugreg(vcpu->arch.host_db[1], 1);
>> -		set_debugreg(vcpu->arch.host_db[2], 2);
>> -		set_debugreg(vcpu->arch.host_db[3], 3);
>> +	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
> 
> IIRC, you must clear dr7 before loading dr0..3 to avoid spurious effects.
> 
>> +		set_debugreg(current->thread.debugreg0, 0);
>> +		set_debugreg(current->thread.debugreg1, 1);
>> +		set_debugreg(current->thread.debugreg2, 2);
>> +		set_debugreg(current->thread.debugreg3, 3);
>> +		set_debugreg(current->thread.debugreg6, 6);
>> +		set_debugreg(current->thread.debugreg7, 7);
>>  	}
>> -	set_debugreg(vcpu->arch.host_dr6, 6);
>> -	set_debugreg(vcpu->arch.host_dr7, 7);
> 
> Unless I miss something ATM, moving this into the if-clause should cause
> troubles if the guest leaves dr7 armed behind. But I need to refresh my
> memory on this.

Looks like VMX loads 0x400 unconditionally into dr7 on vmexit, thus all
hw debugging is initially disabled; I must have missed this while
reworking the dr switch code. So we can safely load dr0..3 without
clearing dr7 first, and we can skip restoring dr6 and dr7 unless the
current process (the hypervisor) is itself being debugged.
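
Roughly, the exit path would then only need something like this (sketch
only, using the thread_struct field names from the patch):

	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
		/* dr7 is already 0x400 after vmexit, so dr0..3 can be
		 * written without disarming anything first */
		set_debugreg(current->thread.debugreg0, 0);
		set_debugreg(current->thread.debugreg1, 1);
		set_debugreg(current->thread.debugreg2, 2);
		set_debugreg(current->thread.debugreg3, 3);
		if (test_thread_flag(TIF_DEBUG)) {
			/* only a debugged hypervisor cares about these */
			set_debugreg(current->thread.debugreg6, 6);
			set_debugreg(current->thread.debugreg7, 7);
		}
	}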

> 
>>  
>>  	set_bit(KVM_REQ_KICK, &vcpu->requests);
>>  	local_irq_enable();
> 
> Jan
> 

Recent Intel CPUs seem to provide control over dr7/debugctl MSR
saving/restoring (bit 2 in the vmexit and vmentry controls). Already thought
about whether this may further help to reduce the common case (i.e. no host
or guest drX usage)?

Jan
Avi Kivity Sept. 1, 2009, 11:16 a.m. UTC | #4
On 09/01/2009 01:42 PM, Jan Kiszka wrote:
>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 891234b..9e3acbd 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -3627,14 +3627,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>
>>   	kvm_guest_enter();
>>
>> -	get_debugreg(vcpu->arch.host_dr6, 6);
>> -	get_debugreg(vcpu->arch.host_dr7, 7);
>>   	if (unlikely(vcpu->arch.switch_db_regs)) {
>> -		get_debugreg(vcpu->arch.host_db[0], 0);
>> -		get_debugreg(vcpu->arch.host_db[1], 1);
>> -		get_debugreg(vcpu->arch.host_db[2], 2);
>> -		get_debugreg(vcpu->arch.host_db[3], 3);
>> -
>>      
> I'm fine with this for the common case, but we should switch to the safe
> pattern on test_thread_flag(TIF_DEBUG), ie. if someone debugs qemu using
> hardware break/watchpoints.
>    

IIUC, thread.debugregs should be synced with the hardware registers.  
The kernel itself only writes to the debug registers, so we're safe 
doing the same.

>    
>>   		set_debugreg(0, 7);
>>   		set_debugreg(vcpu->arch.eff_db[0], 0);
>>   		set_debugreg(vcpu->arch.eff_db[1], 1);
>> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>   	trace_kvm_entry(vcpu->vcpu_id);
>>   	kvm_x86_ops->run(vcpu);
>>
>> -	if (unlikely(vcpu->arch.switch_db_regs)) {
>> -		set_debugreg(0, 7);
>> -		set_debugreg(vcpu->arch.host_db[0], 0);
>> -		set_debugreg(vcpu->arch.host_db[1], 1);
>> -		set_debugreg(vcpu->arch.host_db[2], 2);
>> -		set_debugreg(vcpu->arch.host_db[3], 3);
>> +	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
>>      
> IIRC, you must clear dr7 before loading dr0..3 to avoid spurious effects.
>    

Yeah, I thought of it but forgot to implement it.

>> +		set_debugreg(current->thread.debugreg0, 0);
>> +		set_debugreg(current->thread.debugreg1, 1);
>> +		set_debugreg(current->thread.debugreg2, 2);
>> +		set_debugreg(current->thread.debugreg3, 3);
>> +		set_debugreg(current->thread.debugreg6, 6);
>> +		set_debugreg(current->thread.debugreg7, 7);
>>   	}
>> -	set_debugreg(vcpu->arch.host_dr6, 6);
>> -	set_debugreg(vcpu->arch.host_dr7, 7);
>>      
> Unless I miss something ATM, moving this into the if-clause should cause
> troubles if the guest leaves dr7 armed behind. But I need to refresh my
> memory on this.
>    

The hardware will clear dr7 on exit.

Updated patch to follow.
Avi Kivity Sept. 1, 2009, 11:21 a.m. UTC | #5
On 09/01/2009 02:16 PM, Avi Kivity wrote:
>
>>>           set_debugreg(0, 7);
>>>           set_debugreg(vcpu->arch.eff_db[0], 0);
>>>           set_debugreg(vcpu->arch.eff_db[1], 1);
>>> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu 
>>> *vcpu)
>>>       trace_kvm_entry(vcpu->vcpu_id);
>>>       kvm_x86_ops->run(vcpu);
>>>
>>> -    if (unlikely(vcpu->arch.switch_db_regs)) {
>>> -        set_debugreg(0, 7);
>>> -        set_debugreg(vcpu->arch.host_db[0], 0);
>>> -        set_debugreg(vcpu->arch.host_db[1], 1);
>>> -        set_debugreg(vcpu->arch.host_db[2], 2);
>>> -        set_debugreg(vcpu->arch.host_db[3], 3);
>>> +    if (unlikely(vcpu->arch.switch_db_regs || 
>>> test_thread_flag(TIF_DEBUG))) {
>> IIRC, you must clear dr7 before loading dr0..3 to avoid spurious 
>> effects.
>
> Yeah, I thought of it but forgot to implement it.

Actually on the entry path it is done correctly, and on the exit path 
dr7 is cleared, so this patch is correct.
Marcelo Tosatti Sept. 1, 2009, 11:22 a.m. UTC | #6
On Tue, Sep 01, 2009 at 12:44:08PM +0300, Avi Kivity wrote:
> Instead of saving the debug registers from the processor to a kvm data
> structure, rely in the debug registers stored in the thread structure.
> This allows us not to save dr6 and dr7.
> 
> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.

Is this kgdb safe?

> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    3 ---
>  arch/x86/kvm/x86.c              |   22 +++++++---------------
>  2 files changed, 7 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6046e6f..45226f0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -362,9 +362,6 @@ struct kvm_vcpu_arch {
>  	u32 pat;
>  
>  	int switch_db_regs;
> -	unsigned long host_db[KVM_NR_DB_REGS];
> -	unsigned long host_dr6;
> -	unsigned long host_dr7;
>  	unsigned long db[KVM_NR_DB_REGS];
>  	unsigned long dr6;
>  	unsigned long dr7;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 891234b..9e3acbd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3627,14 +3627,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	kvm_guest_enter();
>  
> -	get_debugreg(vcpu->arch.host_dr6, 6);
> -	get_debugreg(vcpu->arch.host_dr7, 7);
>  	if (unlikely(vcpu->arch.switch_db_regs)) {
> -		get_debugreg(vcpu->arch.host_db[0], 0);
> -		get_debugreg(vcpu->arch.host_db[1], 1);
> -		get_debugreg(vcpu->arch.host_db[2], 2);
> -		get_debugreg(vcpu->arch.host_db[3], 3);
> -
>  		set_debugreg(0, 7);
>  		set_debugreg(vcpu->arch.eff_db[0], 0);
>  		set_debugreg(vcpu->arch.eff_db[1], 1);
> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	trace_kvm_entry(vcpu->vcpu_id);
>  	kvm_x86_ops->run(vcpu);
>  
> -	if (unlikely(vcpu->arch.switch_db_regs)) {
> -		set_debugreg(0, 7);
> -		set_debugreg(vcpu->arch.host_db[0], 0);
> -		set_debugreg(vcpu->arch.host_db[1], 1);
> -		set_debugreg(vcpu->arch.host_db[2], 2);
> -		set_debugreg(vcpu->arch.host_db[3], 3);
> +	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
> +		set_debugreg(current->thread.debugreg0, 0);
> +		set_debugreg(current->thread.debugreg1, 1);
> +		set_debugreg(current->thread.debugreg2, 2);
> +		set_debugreg(current->thread.debugreg3, 3);
> +		set_debugreg(current->thread.debugreg6, 6);
> +		set_debugreg(current->thread.debugreg7, 7);
>  	}
> -	set_debugreg(vcpu->arch.host_dr6, 6);
> -	set_debugreg(vcpu->arch.host_dr7, 7);
>  
>  	set_bit(KVM_REQ_KICK, &vcpu->requests);
>  	local_irq_enable();
> -- 
> 1.6.4.1
Jan Kiszka Sept. 1, 2009, 11:28 a.m. UTC | #7
Marcelo Tosatti wrote:
> On Tue, Sep 01, 2009 at 12:44:08PM +0300, Avi Kivity wrote:
>> Instead of saving the debug registers from the processor to a kvm data
>> structure, rely in the debug registers stored in the thread structure.
>> This allows us not to save dr6 and dr7.
>>
>> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
> 
> Is this kgdb safe?

Nope, kgdb writes directly to the debug registers.

I vaguely recall someone trying to push a debug register management
framework. Did it hit mainline in the meantime? I did not find any trace
at a quick glance, at least not in kgdb.

Jan

> 
>> Signed-off-by: Avi Kivity <avi@redhat.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    3 ---
>>  arch/x86/kvm/x86.c              |   22 +++++++---------------
>>  2 files changed, 7 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 6046e6f..45226f0 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -362,9 +362,6 @@ struct kvm_vcpu_arch {
>>  	u32 pat;
>>  
>>  	int switch_db_regs;
>> -	unsigned long host_db[KVM_NR_DB_REGS];
>> -	unsigned long host_dr6;
>> -	unsigned long host_dr7;
>>  	unsigned long db[KVM_NR_DB_REGS];
>>  	unsigned long dr6;
>>  	unsigned long dr7;
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 891234b..9e3acbd 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -3627,14 +3627,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>  
>>  	kvm_guest_enter();
>>  
>> -	get_debugreg(vcpu->arch.host_dr6, 6);
>> -	get_debugreg(vcpu->arch.host_dr7, 7);
>>  	if (unlikely(vcpu->arch.switch_db_regs)) {
>> -		get_debugreg(vcpu->arch.host_db[0], 0);
>> -		get_debugreg(vcpu->arch.host_db[1], 1);
>> -		get_debugreg(vcpu->arch.host_db[2], 2);
>> -		get_debugreg(vcpu->arch.host_db[3], 3);
>> -
>>  		set_debugreg(0, 7);
>>  		set_debugreg(vcpu->arch.eff_db[0], 0);
>>  		set_debugreg(vcpu->arch.eff_db[1], 1);
>> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>  	trace_kvm_entry(vcpu->vcpu_id);
>>  	kvm_x86_ops->run(vcpu);
>>  
>> -	if (unlikely(vcpu->arch.switch_db_regs)) {
>> -		set_debugreg(0, 7);
>> -		set_debugreg(vcpu->arch.host_db[0], 0);
>> -		set_debugreg(vcpu->arch.host_db[1], 1);
>> -		set_debugreg(vcpu->arch.host_db[2], 2);
>> -		set_debugreg(vcpu->arch.host_db[3], 3);
>> +	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
>> +		set_debugreg(current->thread.debugreg0, 0);
>> +		set_debugreg(current->thread.debugreg1, 1);
>> +		set_debugreg(current->thread.debugreg2, 2);
>> +		set_debugreg(current->thread.debugreg3, 3);
>> +		set_debugreg(current->thread.debugreg6, 6);
>> +		set_debugreg(current->thread.debugreg7, 7);
>>  	}
>> -	set_debugreg(vcpu->arch.host_dr6, 6);
>> -	set_debugreg(vcpu->arch.host_dr7, 7);
>>  
>>  	set_bit(KVM_REQ_KICK, &vcpu->requests);
>>  	local_irq_enable();
>> -- 
>> 1.6.4.1
Marcelo Tosatti Sept. 1, 2009, 11:32 a.m. UTC | #8
On Tue, Sep 01, 2009 at 01:28:46PM +0200, Jan Kiszka wrote:
> Marcelo Tosatti wrote:
> > On Tue, Sep 01, 2009 at 12:44:08PM +0300, Avi Kivity wrote:
> >> Instead of saving the debug registers from the processor to a kvm data
> >> structure, rely in the debug registers stored in the thread structure.
> >> This allows us not to save dr6 and dr7.
> >>
> >> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
> > 
> > Is this kgdb safe?
> 
> Nope, kgdb writes directly to the debug registers.
> 
> I vaguely recall someone trying to push a debug register management
> framework. Did it hit mainline in the meantime? I do not find any trace
> on quick glance, at least not in kgdb.

A simple kgdb_enabled sort of flag, in addition to TIF_DEBUG would do
it?

> Jan
> 
> > 
> >> Signed-off-by: Avi Kivity <avi@redhat.com>
> >> ---
> >>  arch/x86/include/asm/kvm_host.h |    3 ---
> >>  arch/x86/kvm/x86.c              |   22 +++++++---------------
> >>  2 files changed, 7 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index 6046e6f..45226f0 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -362,9 +362,6 @@ struct kvm_vcpu_arch {
> >>  	u32 pat;
> >>  
> >>  	int switch_db_regs;
> >> -	unsigned long host_db[KVM_NR_DB_REGS];
> >> -	unsigned long host_dr6;
> >> -	unsigned long host_dr7;
> >>  	unsigned long db[KVM_NR_DB_REGS];
> >>  	unsigned long dr6;
> >>  	unsigned long dr7;
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> index 891234b..9e3acbd 100644
> >> --- a/arch/x86/kvm/x86.c
> >> +++ b/arch/x86/kvm/x86.c
> >> @@ -3627,14 +3627,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >>  
> >>  	kvm_guest_enter();
> >>  
> >> -	get_debugreg(vcpu->arch.host_dr6, 6);
> >> -	get_debugreg(vcpu->arch.host_dr7, 7);
> >>  	if (unlikely(vcpu->arch.switch_db_regs)) {
> >> -		get_debugreg(vcpu->arch.host_db[0], 0);
> >> -		get_debugreg(vcpu->arch.host_db[1], 1);
> >> -		get_debugreg(vcpu->arch.host_db[2], 2);
> >> -		get_debugreg(vcpu->arch.host_db[3], 3);
> >> -
> >>  		set_debugreg(0, 7);
> >>  		set_debugreg(vcpu->arch.eff_db[0], 0);
> >>  		set_debugreg(vcpu->arch.eff_db[1], 1);
> >> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >>  	trace_kvm_entry(vcpu->vcpu_id);
> >>  	kvm_x86_ops->run(vcpu);
> >>  
> >> -	if (unlikely(vcpu->arch.switch_db_regs)) {
> >> -		set_debugreg(0, 7);
> >> -		set_debugreg(vcpu->arch.host_db[0], 0);
> >> -		set_debugreg(vcpu->arch.host_db[1], 1);
> >> -		set_debugreg(vcpu->arch.host_db[2], 2);
> >> -		set_debugreg(vcpu->arch.host_db[3], 3);
> >> +	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
> >> +		set_debugreg(current->thread.debugreg0, 0);
> >> +		set_debugreg(current->thread.debugreg1, 1);
> >> +		set_debugreg(current->thread.debugreg2, 2);
> >> +		set_debugreg(current->thread.debugreg3, 3);
> >> +		set_debugreg(current->thread.debugreg6, 6);
> >> +		set_debugreg(current->thread.debugreg7, 7);
> >>  	}
> >> -	set_debugreg(vcpu->arch.host_dr6, 6);
> >> -	set_debugreg(vcpu->arch.host_dr7, 7);
> >>  
> >>  	set_bit(KVM_REQ_KICK, &vcpu->requests);
> >>  	local_irq_enable();
> >> -- 
> >> 1.6.4.1
> 
> -- 
> Siemens AG, Corporate Technology, CT SE 2
> Corporate Competence Center Embedded Linux
Jan Kiszka Sept. 1, 2009, 11:33 a.m. UTC | #9
Jan Kiszka wrote:
> Marcelo Tosatti wrote:
>> On Tue, Sep 01, 2009 at 12:44:08PM +0300, Avi Kivity wrote:
>>> Instead of saving the debug registers from the processor to a kvm data
>>> structure, rely in the debug registers stored in the thread structure.
>>> This allows us not to save dr6 and dr7.
>>>
>>> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
>> Is this kgdb safe?
> 
> Nope, kgdb writes directly to the debug registers.
> 
> I vaguely recall someone trying to push a debug register management
> framework. Did it hit mainline in the meantime? I do not find any trace
> on quick glance, at least not in kgdb.

For the time being, falling back to conservative save/restore if kgdb is
configured in and kgdb_connected != 0 should allow us to apply this
optimization otherwise.
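
Roughly (sketch only - kgdb_connected is the flag kgdb already exports
via <linux/kgdb.h>, the helper name is made up):

	#ifdef CONFIG_KGDB
	#include <linux/kgdb.h>
	#endif

	static inline bool host_owns_debugregs(void)
	{
	#ifdef CONFIG_KGDB
		if (kgdb_connected)
			return true;
	#endif
		return test_thread_flag(TIF_DEBUG);
	}

vcpu_enter_guest() would then test host_owns_debugregs() instead of the
bare TIF_DEBUG check and fall back to the conservative save/restore.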

Jan
Avi Kivity Sept. 1, 2009, 11:34 a.m. UTC | #10
On 09/01/2009 02:22 PM, Marcelo Tosatti wrote:
> On Tue, Sep 01, 2009 at 12:44:08PM +0300, Avi Kivity wrote:
>    
>> Instead of saving the debug registers from the processor to a kvm data
>> structure, rely in the debug registers stored in the thread structure.
>> This allows us not to save dr6 and dr7.
>>
>> Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
>>      
> Is this kgdb safe?
>    

I don't think so - kgdb seems to access the debug registers directly.  I 
think we can live with it at the moment, and later add an API to 
arbitrate debug register usage among the various users (kgdb seems to be 
unsafe wrt userspace debug as well).
Avi Kivity Sept. 1, 2009, 11:35 a.m. UTC | #11
On 09/01/2009 02:32 PM, Marcelo Tosatti wrote:
>
>> Nope, kgdb writes directly to the debug registers.
>>
>> I vaguely recall someone trying to push a debug register management
>> framework. Did it hit mainline in the meantime? I do not find any trace
>> on quick glance, at least not in kgdb.
>>      
> A simple kgdb_enabled sort of flag, in addition to TIF_DEBUG would do
> it?
>    

kgdb is per-cpu whereas user debugging is per-task, so we'd need a 
per-cpu debug register cache and flag.
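
A per-cpu arbitration point might look roughly like this (hypothetical
sketch, no such API exists today):

	#include <linux/percpu.h>

	/* filled in by kgdb when it arms hw breakpoints on a cpu;
	 * kvm would restore from here after vmexit in addition to
	 * the per-task thread.debugregN values */
	struct host_debugreg_cache {
		unsigned long db[4];
		unsigned long dr6;
		unsigned long dr7;
		int in_use;
	};
	static DEFINE_PER_CPU(struct host_debugreg_cache, debugreg_cache);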
Avi Kivity Sept. 1, 2009, 11:43 a.m. UTC | #12
On 09/01/2009 02:33 PM, Jan Kiszka wrote:
> For the time being, falling back to conservative save/restore if kgdb is
> configured in and kgdb_connected != 0 should allow us to apply this
> optimization otherwise.
>    

I prefer a real fix instead of spaghettiing the code this way (and in 
the meanwhile I'm fine with kgdb hardware breakpoints not working while 
kvm is running).
Jan Kiszka Sept. 1, 2009, 11:45 a.m. UTC | #13
Avi Kivity wrote:
> On 09/01/2009 02:33 PM, Jan Kiszka wrote:
>> For the time being, falling back to conservative save/restore if kgdb is
>> configured in and kgdb_connected != 0 should allow us to apply this
>> optimization otherwise.
>>    
> 
> I prefer a real fix instead of spaghettiing the code this way (and in 
> the meanwhile I'm fine with kgdb hardware breakpoints not working while 
> kvm is running).

Then file a patch that makes CONFIG_KVM depend on !CONFIG_KGDB (or vice
versa). Silent breakages are evil.

Jan
Avi Kivity Sept. 1, 2009, 11:56 a.m. UTC | #14
On 09/01/2009 02:45 PM, Jan Kiszka wrote:
> Avi Kivity wrote:
>    
>> On 09/01/2009 02:33 PM, Jan Kiszka wrote:
>>      
>>> For the time being, falling back to conservative save/restore if kgdb is
>>> configured in and kgdb_connected != 0 should allow us to apply this
>>> optimization otherwise.
>>>
>>>        
>> I prefer a real fix instead of spaghettiing the code this way (and in
>> the meanwhile I'm fine with kgdb hardware breakpoints not working while
>> kvm is running).
>>      
> Then file a patch that makes CONFIG_KVM depend on !CONFIG_KGDB (or vice
> versa). Silent breakages are evil.
>    

Distros typically enable both.
Jan Kiszka Sept. 1, 2009, 12:01 p.m. UTC | #15
Avi Kivity wrote:
> On 09/01/2009 02:45 PM, Jan Kiszka wrote:
>> Avi Kivity wrote:
>>    
>>> On 09/01/2009 02:33 PM, Jan Kiszka wrote:
>>>      
>>>> For the time being, falling back to conservative save/restore if kgdb is
>>>> configured in and kgdb_connected != 0 should allow us to apply this
>>>> optimization otherwise.
>>>>
>>>>        
>>> I prefer a real fix instead of spaghettiing the code this way (and in
>>> the meanwhile I'm fine with kgdb hardware breakpoints not working while
>>> kvm is running).
>>>      
>> Then file a patch that makes CONFIG_KVM depend on !CONFIG_KGDB (or vice
>> versa). Silent breakages are evil.
>>    
> 
> Distros typically enable both.

Then we need to resolve it right from the start, not in some distant
future. We already have everything we need available; we could just try
to refactor it into a nicer interface (something that encapsulates both
compile-time and runtime tests).

Jan
Avi Kivity Sept. 1, 2009, 12:02 p.m. UTC | #16
On 09/01/2009 03:01 PM, Jan Kiszka wrote:
> Avi Kivity wrote:
>    
>> On 09/01/2009 02:45 PM, Jan Kiszka wrote:
>>      
>>> Avi Kivity wrote:
>>>
>>>        
>>>> On 09/01/2009 02:33 PM, Jan Kiszka wrote:
>>>>
>>>>          
>>>>> For the time being, falling back to conservative save/restore if kgdb is
>>>>> configured in and kgdb_connected != 0 should allow us to apply this
>>>>> optimization otherwise.
>>>>>
>>>>>
>>>>>            
>>>> I prefer a real fix instead of spaghettiing the code this way (and in
>>>> the meanwhile I'm fine with kgdb hardware breakpoints not working while
>>>> kvm is running).
>>>>
>>>>          
>>> Then file a patch that makes CONFIG_KVM depend on !CONFIG_KGDB (or vice
>>> versa). Silent breakages are evil.
>>>
>>>        
>> Distros typically enable both.
>>      
> Then we need to resolve it right from the start, not in some distant
> future. We already have everything we need available, we could just try
> to refactor it to a nicer interface (something that encapsulates both
> compile and runtime tests).
>    

I'll look at doing something.  It shouldn't be difficult.
Avi Kivity Sept. 1, 2009, 12:27 p.m. UTC | #17
On 09/01/2009 03:02 PM, Avi Kivity wrote:
>> Then we need to resolve it right from the start, not in some distant
>> future. We already have everything we need available, we could just try
>> to refactor it to a nicer interface (something that encapsulates both
>> compile and runtime tests).
>
>
> I'll look at doing something.  It shouldn't be difficult.
>

btw, it looks like kgdb is already broken wrt ptrace hardware breakpoints.
Jan Kiszka Sept. 1, 2009, 1:31 p.m. UTC | #18
Avi Kivity wrote:
> On 09/01/2009 01:42 PM, Jan Kiszka wrote:
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index 891234b..9e3acbd 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -3627,14 +3627,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>>
>>>   	kvm_guest_enter();
>>>
>>> -	get_debugreg(vcpu->arch.host_dr6, 6);
>>> -	get_debugreg(vcpu->arch.host_dr7, 7);
>>>   	if (unlikely(vcpu->arch.switch_db_regs)) {
>>> -		get_debugreg(vcpu->arch.host_db[0], 0);
>>> -		get_debugreg(vcpu->arch.host_db[1], 1);
>>> -		get_debugreg(vcpu->arch.host_db[2], 2);
>>> -		get_debugreg(vcpu->arch.host_db[3], 3);
>>> -
>>>      
>> I'm fine with this for the common case, but we should switch to the safe
>> pattern on test_thread_flag(TIF_DEBUG), ie. if someone debugs qemu using
>> hardware break/watchpoints.
>>    
> 
> IIUC, thread.debugregs should be synced with the hardware registers.  
> The kernel itself only writes to the debug registers, so we're safe 
> doing the same.

Yep, confirmed. There is only an unsynced window between debug trap
handling and signal injection, but I cannot imagine that kvm could
intercept this path with a guest entry.

> 
>>    
>>>   		set_debugreg(0, 7);
>>>   		set_debugreg(vcpu->arch.eff_db[0], 0);
>>>   		set_debugreg(vcpu->arch.eff_db[1], 1);
>>> @@ -3645,15 +3638,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>>   	trace_kvm_entry(vcpu->vcpu_id);
>>>   	kvm_x86_ops->run(vcpu);
>>>
>>> -	if (unlikely(vcpu->arch.switch_db_regs)) {
>>> -		set_debugreg(0, 7);
>>> -		set_debugreg(vcpu->arch.host_db[0], 0);
>>> -		set_debugreg(vcpu->arch.host_db[1], 1);
>>> -		set_debugreg(vcpu->arch.host_db[2], 2);
>>> -		set_debugreg(vcpu->arch.host_db[3], 3);
>>> +	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
>>>      
>> IIRC, you must clear dr7 before loading dr0..3 to avoid spurious effects.
>>    
> 
> Yeah, I thought of it but forgot to implement it.
> 
>>> +		set_debugreg(current->thread.debugreg0, 0);
>>> +		set_debugreg(current->thread.debugreg1, 1);
>>> +		set_debugreg(current->thread.debugreg2, 2);
>>> +		set_debugreg(current->thread.debugreg3, 3);
>>> +		set_debugreg(current->thread.debugreg6, 6);
>>> +		set_debugreg(current->thread.debugreg7, 7);
>>>   	}
>>> -	set_debugreg(vcpu->arch.host_dr6, 6);
>>> -	set_debugreg(vcpu->arch.host_dr7, 7);
>>>      
>> Unless I miss something ATM, moving this into the if-clause should cause
>> troubles if the guest leaves dr7 armed behind. But I need to refresh my
>> memory on this.
>>    
> 
> The hardware will clear dr7 on exit.

Given this fact, I wonder why you want to reload host dr0..3 on
vcpu->arch.switch_db_regs. if (unlikely(test_thread_flag(TIF_DEBUG)))
should suffice then, right?

Jan
Avi Kivity Sept. 1, 2009, 1:39 p.m. UTC | #19
On 09/01/2009 04:31 PM, Jan Kiszka wrote:
>> The hardware will clear dr7 on exit.
>>      
> Given this fact, I wonder why you want to reload host dr0..3 on
> vcpu->arch.switch_db_regs. if (unlikely(test_thread_flag(TIF_DEBUG)))
> should suffice then, right?
>    

I think you're right.  It would mean unsynced hardware and software 
registers, but only if debugging is disabled, which should be fine.
Andrew Theurer Sept. 1, 2009, 6:12 p.m. UTC | #20
On Tue, 2009-09-01 at 12:47 +0300, Avi Kivity wrote:
> On 09/01/2009 12:44 PM, Avi Kivity wrote:
> > Instead of saving the debug registers from the processor to a kvm data
> > structure, rely in the debug registers stored in the thread structure.
> > This allows us not to save dr6 and dr7.
> >
> > Reduces lightweight vmexit cost by 350 cycles, or 11 percent.
> >    
> 
> Andrew, this is now available as the 'debugreg' branch of kvm.git.  
> Given the massive performance improvement, it will be interesting to see 
> how the test results change.
> 
> Marcelo, please queue this for 2.6.32, and I think it's even suitable 
> for -stable.
> 

Here's a run from branch debugreg with thread debugreg storage +
conditionally reload dr6:

user  nice  system   irq  softirq guest   idle  iowait
5.79  0.00    9.28  0.08     1.00 20.81  58.78    4.26
total busy: 36.97

Previous run that had avoided calling adjust_vmx_controls twice:

user  nice  system   irq  softirq guest   idle  iowait
5.81  0.00    9.48  0.08    1.04  21.32  57.86    4.41
total busy: 37.73

A relative reduction in CPU cycles of 2%.

new oprofile:

> samples  %        app name                 symbol name
> 876648   54.1555  kvm-intel.ko             vmx_vcpu_run
> 37595     2.3225  qemu-system-x86_64       cpu_physical_memory_rw
> 35623     2.2006  qemu-system-x86_64       phys_page_find_alloc
> 24874     1.5366  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 native_write_msr_safe
> 17710     1.0940  libc-2.5.so              memcpy
> 14664     0.9059  kvm.ko                   kvm_arch_vcpu_ioctl_run
> 14577     0.9005  qemu-system-x86_64       qemu_get_ram_ptr
> 12528     0.7739  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 native_read_msr_safe
> 10979     0.6782  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 copy_user_generic_string
> 9979      0.6165  qemu-system-x86_64       virtqueue_get_head
> 9371      0.5789  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 schedule
> 8333      0.5148  qemu-system-x86_64       virtqueue_avail_bytes
> 7899      0.4880  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 fget_light
> 7289      0.4503  qemu-system-x86_64       main_loop_wait
> 7217      0.4458  qemu-system-x86_64       lduw_phys
> 6821      0.4214  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 audit_syscall_exit
> 6749      0.4169  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 do_select
> 5919      0.3657  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 audit_syscall_entry
> 5466      0.3377  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 kfree
> 4887      0.3019  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 fput
> 4689      0.2897  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 __switch_to
> 4636      0.2864  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 mwait_idle
> 4505      0.2783  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 getnstimeofday
> 4453      0.2751  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 system_call
> 4403      0.2720  kvm.ko                   kvm_load_guest_fpu
> 4285      0.2647  kvm.ko                   kvm_put_guest_fpu
> 4241      0.2620  libpthread-2.5.so        pthread_mutex_lock
> 4172      0.2577  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 unroll_tree_refs
> 4100      0.2533  qemu-system-x86_64       kvm_run
> 4044      0.2498  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 __down_read
> 3978      0.2457  qemu-system-x86_64       ldl_phys
> 3669      0.2267  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 do_vfs_ioctl
> 3655      0.2258  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 __up_read
> 

A diff of this and previous run's oprofile:


> profile1 is [./oprofile.before]
> profile2 is [./oprofile.after]
> Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (No unit mask) count 10000000
> total samples (ts1) for profile1 is 1661542 
> total samples (ts2) for profile2 is 1618760 (includes multiplier of 1.000000)
> functions which have a abs(pct2-pct1) < 0.02 are not displayed
> 
>                               pct2:   pct1:                                      
>                                100*    100*  pct2                                
>        s1        s2   s2/s1  s2/ts1  s1/ts1  -pct1 symbol                     bin
> --------- --------- ------- ------- ------- ------ ------                     ---
>      1559      2747  1.76/1   0.165   0.094  0.071 dput                   vmlinux
>     34764     35623  1.02/1   2.144   2.092  0.052 phys_page_find_alloc      qemu
>      5170      5919  1.14/1   0.356   0.311  0.045 audit_syscall_entry    vmlinux
>      3593      4172  1.16/1   0.251   0.216  0.035 unroll_tree_refs       vmlinux
>       896      1414  1.58/1   0.085   0.054  0.031 kstat_irqs_cpu         vmlinux
>       199       676  3.40/1   0.041   0.012  0.029 /usr/lib64/perl5/5.8 libperl.so
>     14189     14664  1.03/1   0.883   0.854  0.029 kvm_arch_vcpu_ioctl_    kvm.ko
>      1314      1779  1.35/1   0.107   0.079  0.028 kvm_arch_post_kvm_ru      qemu
>      1390      1785  1.28/1   0.107   0.084  0.024 vfs_read               vmlinux
>      9015      9371  1.04/1   0.564   0.543  0.021 schedule               vmlinux
>      1265      1610  1.27/1   0.097   0.076  0.021 qemu_get_clock            qemu
>       600       941  1.57/1   0.057   0.036  0.021 preempt_notifier_reg   vmlinux
>      3339      2996  1/1.11   0.180   0.201 -0.021 virtqueue_num_heads       qemu
>       822       463  1/1.78   0.028   0.049 -0.022 vmx_vcpu_put               kvm
>      1116       743  1/1.50   0.045   0.067 -0.022 cap_file_ioctl         vmlinux
>      2967      2587  1/1.15   0.156   0.179 -0.023 native_read_tsc        vmlinux
>      3487      3096  1/1.13   0.186   0.210 -0.024 task_rq_lock           vmlinux
>      1780      1381  1/1.29   0.083   0.107 -0.024 math_state_restore     vmlinux
>     25278     24874  1/1.02   1.497   1.521 -0.024 native_write_msr_saf   vmlinux
>      1733      1329  1/1.30   0.080   0.104 -0.024 rw_verify_area         vmlinux
>      2305      1897  1/1.22   0.114   0.139 -0.025 __hrtimer_start_rang   vmlinux
>      1685      1273  1/1.32   0.077   0.101 -0.025 tcp_poll               vmlinux
>      1059       637  1/1.66   0.038   0.064 -0.025 kvm_lapic_get_cr8       kvm.ko
>      2855      2425  1/1.18   0.146   0.172 -0.026 signalfd_poll          vmlinux
>      4100      3669  1/1.12   0.221   0.247 -0.026 do_vfs_ioctl           vmlinux
>      1941      1498  1/1.30   0.090   0.117 -0.027 complete_pio            kvm.ko
>      3506      3052  1/1.15   0.184   0.211 -0.027 vmcs_writel                kvm
>     18205     17710  1/1.03   1.066   1.096 -0.030 memcpy                    libc
>      1717      1114  1/1.54   0.067   0.103 -0.036 rb_insert_color        vmlinux
>     10871      9979  1/1.09   0.601   0.654 -0.054 virtqueue_get_head        qemu
>      7805      6749  1/1.16   0.406   0.470 -0.064 do_select              vmlinux
>      9080      7899  1/1.15   0.475   0.546 -0.071 fget_light             vmlinux
>      3550         0           0.000   0.214 -0.214 native_get_debugreg    vmlinux
>    885444    876648  1/1.01  52.761  53.290 -0.529 vmx_vcpu_run               kvm
>     12380         0           0.000   0.745 -0.745 native_set_debugreg    vmlinux
>                             ------- ------- ------  
>                              94.916  97.469 -2.553  
> 

Notice native_set/get_debugreg is now not an issue :)

All of these still use qemu-kvm-87

-Andrew



Avi Kivity Sept. 1, 2009, 6:23 p.m. UTC | #21
On 09/01/2009 09:12 PM, Andrew Theurer wrote:
> Here's a run from branch debugreg with thread debugreg storage +
> conditionally reload dr6:
>
> user  nice  system   irq  softirq guest   idle  iowait
> 5.79  0.00    9.28  0.08     1.00 20.81  58.78    4.26
> total busy: 36.97
>
> Previous run that had avoided calling adjust_vmx_controls twice:
>
> user  nice  system   irq  softirq guest   idle  iowait
> 5.81  0.00    9.48  0.08    1.04  21.32  57.86    4.41
> total busy: 37.73
>
> A relative reduction CPU cycles of 2%
>    

That was an easy fruit to pick.  Too bad it was a regression that we 
introduced.

> new oprofile:
>
>    
>> samples  %        app name                 symbol name
>> 876648   54.1555  kvm-intel.ko             vmx_vcpu_run
>> 37595     2.3225  qemu-system-x86_64       cpu_physical_memory_rw
>> 35623     2.2006  qemu-system-x86_64       phys_page_find_alloc
>> 24874     1.5366  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 native_write_msr_safe
>> 17710     1.0940  libc-2.5.so              memcpy
>> 14664     0.9059  kvm.ko                   kvm_arch_vcpu_ioctl_run
>> 14577     0.9005  qemu-system-x86_64       qemu_get_ram_ptr
>> 12528     0.7739  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 native_read_msr_safe
>> 10979     0.6782  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 copy_user_generic_string
>> 9979      0.6165  qemu-system-x86_64       virtqueue_get_head
>> 9371      0.5789  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 schedule
>> 8333      0.5148  qemu-system-x86_64       virtqueue_avail_bytes
>> 7899      0.4880  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 fget_light
>> 7289      0.4503  qemu-system-x86_64       main_loop_wait
>> 7217      0.4458  qemu-system-x86_64       lduw_phys
>>      

This is almost entirely host virtio.  I can reduce native_write_msr_safe 
by a bit, but not much.

>> 6821      0.4214  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 audit_syscall_exit
>> 6749      0.4169  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 do_select
>> 5919      0.3657  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 audit_syscall_entry
>> 5466      0.3377  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 kfree
>> 4887      0.3019  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 fput
>> 4689      0.2897  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 __switch_to
>> 4636      0.2864  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 mwait_idle
>>      

Still not idle=poll, it may shave off 0.2%.
Andrew Theurer Sept. 4, 2009, 2:48 p.m. UTC | #22
On Tue, 2009-09-01 at 21:23 +0300, Avi Kivity wrote:
> On 09/01/2009 09:12 PM, Andrew Theurer wrote:
> > Here's a run from branch debugreg with thread debugreg storage +
> > conditionally reload dr6:
> >
> > user  nice  system   irq  softirq guest   idle  iowait
> > 5.79  0.00    9.28  0.08     1.00 20.81  58.78    4.26
> > total busy: 36.97
> >
> > Previous run that had avoided calling adjust_vmx_controls twice:
> >
> > user  nice  system   irq  softirq guest   idle  iowait
> > 5.81  0.00    9.48  0.08    1.04  21.32  57.86    4.41
> > total busy: 37.73
> >
> > A relative reduction CPU cycles of 2%
> >    
> 
> That was an wasy fruit to pick.  To bad it was a regression that we 
> introduced.
> 
> > new oprofile:
> >
> >    
> >> samples  %        app name                 symbol name
> >> 876648   54.1555  kvm-intel.ko             vmx_vcpu_run
> >> 37595     2.3225  qemu-system-x86_64       cpu_physical_memory_rw
> >> 35623     2.2006  qemu-system-x86_64       phys_page_find_alloc
> >> 24874     1.5366  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 native_write_msr_safe
> >> 17710     1.0940  libc-2.5.so              memcpy
> >> 14664     0.9059  kvm.ko                   kvm_arch_vcpu_ioctl_run
> >> 14577     0.9005  qemu-system-x86_64       qemu_get_ram_ptr
> >> 12528     0.7739  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 native_read_msr_safe
> >> 10979     0.6782  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 copy_user_generic_string
> >> 9979      0.6165  qemu-system-x86_64       virtqueue_get_head
> >> 9371      0.5789  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 schedule
> >> 8333      0.5148  qemu-system-x86_64       virtqueue_avail_bytes
> >> 7899      0.4880  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 fget_light
> >> 7289      0.4503  qemu-system-x86_64       main_loop_wait
> >> 7217      0.4458  qemu-system-x86_64       lduw_phys
> >>      
> 
> This is almost entirely host virtio.  I can reduce native_write_msr_safe 
> by a bit, but not much.
> 
> >> 6821      0.4214  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 audit_syscall_exit
> >> 6749      0.4169  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 do_select
> >> 5919      0.3657  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 audit_syscall_entry
> >> 5466      0.3377  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 kfree
> >> 4887      0.3019  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 fput
> >> 4689      0.2897  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 __switch_to
> >> 4636      0.2864  vmlinux-2.6.31-rc5_debugreg_v2.6.31-rc3-3441-g479fa73-autokern1 mwait_idle
> >>      
> 
> Still not idle=poll, it may shave off 0.2%.

Won't this affect SMT in a negative way?  (OK, I am not running SMT now,
but eventually we will be.)  A long time ago, we tested P4s with HT, and
a polling idle in one thread always negatively impacted performance in
the sibling thread.

FWIW, I did try idle=halt, and it was slightly worse.

I did get a chance to try the latest qemu (master and next heads).  I
have been running into a problem with the virtIO stor driver for Windows on
anything much newer than kvm-87.  I compiled the driver from the new git
tree and it installed OK, but I still had the same error.  Finally, I removed
the serial number feature from virtio-blk in qemu, and I can now get
the driver to work in Windows.

So, not really any good news on performance with latest qemu builds.
Performance is slightly worse:

qemu-kvm-87
user  nice  system   irq  softirq guest   idle  iowait
5.79  0.00    9.28  0.08     1.00 20.81  58.78    4.26
total busy: 36.97

qemu-kvm-88-905-g6025b2d (master)
user  nice  system   irq  softirq guest   idle  iowait
6.57  0.00   10.86  0.08     1.02 21.35  55.90    4.21
total busy: 39.89

qemu-kvm-88-910-gbf8a05b (next)
user  nice  system   irq  softirq guest   idle  iowait
6.60  0.00  10.91   0.09     1.03 21.35  55.71    4.31
total busy: 39.98

diff of profiles, p1=qemu-kvm-87, p2=qemu-master


> profile1 is qemu-kvm-87
> profile2 is qemu-master
> Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (No unit mask) count 10000000
> total samples (ts1) for profile1 is 1616921 
> total samples (ts2) for profile2 is 1752347 (includes multiplier of 0.995420)
> functions which have a abs(pct2-pct1) < 0.06 are not displayed
> 
>                               pct2:   pct1:                                      
>                                100*    100*  pct2                                
>        s1        s2   s2/s1  s2/ts1  s1/ts1  -pct1 symbol                     bin
> --------- --------- ------- ------- ------- ------ ------                     ---
>    879611    907883  1.03/1  56.149  54.400  1.749 vmx_vcpu_run               kvm
>       614     11553 18.82/1   0.715   0.038  0.677 gfn_to_memslot_unali    kvm.ko
>     34511     44922  1.30/1   2.778   2.134  0.644 phys_page_find_alloc      qemu
>      2866      9334  3.26/1   0.577   0.177  0.400 paging64_walk_addr      kvm.ko
>     11139     17200  1.54/1   1.064   0.689  0.375 copy_user_generic_st   vmlinux
>      3100      7108  2.29/1   0.440   0.192  0.248 x86_decode_insn         kvm.ko
>      8169     11873  1.45/1   0.734   0.505  0.229 virtqueue_avail_byte      qemu
>      1103      4540  4.12/1   0.281   0.068  0.213 kvm_read_guest          kvm.ko
>     17427     20401  1.17/1   1.262   1.078  0.184 memcpy                    libc
>         0      2905           0.180   0.000  0.180 gfn_to_pfn              kvm.ko
>      1831      4328  2.36/1   0.268   0.113  0.154 x86_emulate_insn        kvm.ko
>        65      2431 37.41/1   0.150   0.004  0.146 emulator_read_emulat    kvm.ko
>     14922     17196  1.15/1   1.064   0.923  0.141 qemu_get_ram_ptr          qemu
>       545      2724  5.00/1   0.168   0.034  0.135 emulate_instruction     kvm.ko
>       599      2464  4.11/1   0.152   0.037  0.115 kvm_read_guest_page     kvm.ko
>       503      2355  4.68/1   0.146   0.031  0.115 gfn_to_hva              kvm.ko
>      1076      2918  2.71/1   0.181   0.067  0.114 memcpy_c               vmlinux
>       594      2241  3.77/1   0.139   0.037  0.102 next_segment            kvm.ko
>      1680      3248  1.93/1   0.201   0.104  0.097 pipe_poll              vmlinux
>         0      1463           0.090   0.000  0.090 subpage_readl             qemu
>         0      1363           0.084   0.000  0.084 msix_enabled              qemu
>       527      1883  3.57/1   0.116   0.033  0.084 paging64_gpte_to_gfn    kvm.ko
>       962      2223  2.31/1   0.138   0.059  0.078 do_insn_fetch           kvm.ko
>       348      1605  4.61/1   0.099   0.022  0.078 is_rsvd_bits_set        kvm.ko
>       520      1763  3.39/1   0.109   0.032  0.077 unalias_gfn             kvm.ko
>         1      1163 1163.65/1   0.072   0.000  0.072 tdp_page_fault          kvm.ko
>      3827      4912  1.28/1   0.304   0.237  0.067 __down_read            vmlinux
>         0      1014           0.063   0.000  0.063 mapping_level           kvm.ko
>       973         0           0.000   0.060 -0.060 pm_ioport_readl           qemu
>      1635       528  1/3.09   0.033   0.101 -0.068 ioport_read               qemu
>      2179      1017  1/2.14   0.063   0.135 -0.072 kvm_emulate_pio         kvm.ko
>     25141     23722  1/1.06   1.467   1.555 -0.088 native_write_msr_saf   vmlinux
>      1560         0           0.000   0.096 -0.096 eventfd_poll           vmlinux
>                             ------- ------- ------  
>                             105.100  97.450  7.650  


18x more samples for gfn_to_memslot_unali*, 37x for
emulator_read_emula*, and more CPU time in guest mode.

One other thing I decided to try was some cpu binding.  I know this is
not practical for production, but I wanted to see if there's any benefit
at all.  One reason was that a coworker here tried binding the qemu
thread for the vcpu and the qemu IO thread to the same cpu.  On a
networking test, guest->local-host, throughput was up about 2x.
Obviously there was a nice effect of being on the same cache.  I
wondered whether, even without full-bore throughput tests, we could see
any benefit here.  So, I bound each pair of VMs to a dedicated core.  What I
saw was about a 6% improvement in performance.  For a system which has
pretty incredible memory performance and is not that busy, I was
surprised that I got 6%.  I am not advocating binding, but what I do
wonder is:  on 1-way VMs, if we keep all the qemu threads together on the
same CPU, while still allowing the scheduler to move them (all of them at
once) to different cpus over time, would we see the same benefit?
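
For reference, a thread can pin itself with a plain sched_setaffinity()
call; minimal userspace sketch (cpu number arbitrary, and not necessarily
how the numbers above were produced):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	/* with pid 0, sched_setaffinity() acts on the calling thread only */
	static int pin_self_to_cpu(int cpu)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(cpu, &set);
		if (sched_setaffinity(0, sizeof(set), &set) < 0) {
			perror("sched_setaffinity");
			return -1;
		}
		return 0;
	}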

One other thing:  So far I have not been using preadv/pwritev.  I assume
I need a more recent glibc (on 2.5 now) for qemu to take advantage of
this?

Thanks!

-Andrew


Iggy Jackson Sept. 4, 2009, 3:30 p.m. UTC | #23
On Friday 04 September 2009 09:48:17 am Andrew Theurer wrote:
<snip>
> >
> > Still not idle=poll, it may shave off 0.2%.
> 
> Won't this affect SMT in a negative way?  (OK, I am not running SMT now,
> but eventually we will be) A long time ago, we tested P4's with HT, and
> a polling idle in one thread always negatively impacted performance in
> the sibling thread.
> 
> FWIW, I did try idle=halt, and it was slightly worse.
> 
> I did get a chance to try the latest qemu (master and next heads).  I
> have been running into a problem with virtIO stor driver for windows on
> anything much newer than kvm-87.  I compiled the driver from the new git
> tree, installed OK, but still had the same error.  Finally, I removed
> the serial number feature in the virtio-blk in qemu, and I can now get
> the driver to work in Windows.

What were the symptoms you were seeing (i.e., define "a problem")?

> 
> So, not really any good news on performance with latest qemu builds.
> Performance is slightly worse:
> 
> qemu-kvm-87
> user  nice  system   irq  softirq guest   idle  iowait
> 5.79  0.00    9.28  0.08     1.00 20.81  58.78    4.26
> total busy: 36.97
> 
> qemu-kvm-88-905-g6025b2d (master)
> user  nice  system   irq  softirq guest   idle  iowait
> 6.57  0.00   10.86  0.08     1.02 21.35  55.90    4.21
> total busy: 39.89
> 
> qemu-kvm-88-910-gbf8a05b (next)
> user  nice  system   irq  softirq guest   idle  iowait
> 6.60  0.00  10.91   0.09     1.03 21.35  55.71    4.31
> total busy: 39.98
> 
> diff of profiles, p1=qemu-kvm-87, p2=qemu-master
> 
<snip>
> 
> 18x more samples for gfn_to_memslot_unali*, 37x for
> emulator_read_emula*, and more CPU time in guest mode.
> 
> One other thing I decided to try was some cpu binding.  I know this is
> not practical for production, but I wanted to see if there's any benefit
> at all.  One reason was that a coworker here tried binding the qemu
> thread for the vcpu and the qemu IO thread to the same cpu.  On a
> networking test, guest->local-host, throughput was up about 2x.
> Obviously there was a nice effect of being on the same cache.  I
> wondered, even without full bore throughput tests, could we see any
> benefit here.  So, I bound each pair of VMs to a dedicated core.  What I
> saw was about a 6% improvement in performance.  For a system which has
> pretty incredible memory performance and is not that busy, I was
> surprised that I got 6%.  I am not advocating binding, but what I do
> wonder:  on 1-way VMs, if we keep all the qemu threads together on the
> same CPU, but still allowing the scheduler to move them (all of them at
> once) to different cpus over time, would we see the same benefit?
> 
> One other thing:  So far I have not been using preadv/pwritev.  I assume
> I need a more recent glibc (on 2.5 now) for qemu to take advantage of
> this?

Getting p(read|write)v working almost doubled my virtio-net throughput in a 
Linux guest. Not quite as much in Windows guests. Yes, you need glibc 2.10. I 
think some distros might have backported it to 2.9. You will also need some 
support for it in your system includes.
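
A quick way to check your toolchain is to see whether a trivial preadv()
call compiles (sketch only; needs glibc >= 2.10 plus headers that declare
it):

	#define _GNU_SOURCE
	#include <sys/uio.h>
	#include <fcntl.h>
	#include <stdio.h>

	int main(void)
	{
		char a[16], b[16];
		struct iovec iov[2] = {
			{ .iov_base = a, .iov_len = sizeof(a) },
			{ .iov_base = b, .iov_len = sizeof(b) },
		};
		int fd = open("/etc/hosts", O_RDONLY);

		if (fd < 0)
			return 1;
		/* fill both buffers with one syscall, starting at offset 0 */
		printf("preadv returned %zd\n", preadv(fd, iov, 2, 0));
		return 0;
	}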

--Iggy

> 
> Thanks!
> 
> -Andrew
> 

Andrew Theurer Sept. 4, 2009, 4:08 p.m. UTC | #24
Brian Jackson wrote:
> On Friday 04 September 2009 09:48:17 am Andrew Theurer wrote:
> <snip>
>>> Still not idle=poll, it may shave off 0.2%.
>> Won't this affect SMT in a negative way?  (OK, I am not running SMT now,
>> but eventually we will be) A long time ago, we tested P4's with HT, and
>> a polling idle in one thread always negatively impacted performance in
>> the sibling thread.
>>
>> FWIW, I did try idle=halt, and it was slightly worse.
>>
>> I did get a chance to try the latest qemu (master and next heads).  I
>> have been running into a problem with virtIO stor driver for windows on
>> anything much newer than kvm-87.  I compiled the driver from the new git
>> tree, installed OK, but still had the same error.  Finally, I removed
>> the serial number feature in the virtio-blk in qemu, and I can now get
>> the driver to work in Windows.
> 
> What were the symptoms you were seeing (i.e. define "a problem").

Device manager reports "a problem code 10" occurred, and the driver 
cannot initialize.

Vadim Rozenfeld informed me:
> There is a sanity check in the code, which checks the I/O range and fails if it is not equal to 40h.
> Recent virtio-blk devices have an I/O range equal to 0x400 (serial number feature). So, our signed viostor driver will fail on the latest KVMs. This problem was fixed
> and committed to SVN some time ago.

I assumed the fix was to the virtio windows driver, but I could not get 
the driver I compiled from latest git to work either (only on 
qemu-kvm-87).  So, I just backed out the serial number feature in qemu, 
and it worked.  FWIW, the linux virtio-blk driver never had a problem.

> 
>> So, not really any good news on performance with latest qemu builds.
>> Performance is slightly worse:
>>
>> qemu-kvm-87
>> user  nice  system   irq  softirq guest   idle  iowait
>> 5.79  0.00    9.28  0.08     1.00 20.81  58.78    4.26
>> total busy: 36.97
>>
>> qemu-kvm-88-905-g6025b2d (master)
>> user  nice  system   irq  softirq guest   idle  iowait
>> 6.57  0.00   10.86  0.08     1.02 21.35  55.90    4.21
>> total busy: 39.89
>>
>> qemu-kvm-88-910-gbf8a05b (next)
>> user  nice  system   irq  softirq guest   idle  iowait
>> 6.60  0.00  10.91   0.09     1.03 21.35  55.71    4.31
>> total busy: 39.98
>>
>> diff of profiles, p1=qemu-kvm-87, p2=qemu-master
>>
> <snip>
>> 18x more samples for gfn_to_memslot_unali*, 37x for
>> emulator_read_emula*, and more CPU time in guest mode.
>>
>> One other thing I decided to try was some cpu binding.  I know this is
>> not practical for production, but I wanted to see if there's any benefit
>> at all.  One reason was that a coworker here tried binding the qemu
>> thread for the vcpu and the qemu IO thread to the same cpu.  On a
>> networking test, guest->local-host, throughput was up about 2x.
>> Obviously there was a nice effect of being on the same cache.  I
>> wondered, even without full bore throughput tests, could we see any
>> benefit here.  So, I bound each pair of VMs to a dedicated core.  What I
>> saw was about a 6% improvement in performance.  For a system which has
>> pretty incredible memory performance and is not that busy, I was
>> surprised that I got 6%.  I am not advocating binding, but what I do
>> wonder:  on 1-way VMs, if we keep all the qemu threads together on the
>> same CPU, but still allowing the scheduler to move them (all of them at
>> once) to different cpus over time, would we see the same benefit?
>>
>> One other thing:  So far I have not been using preadv/pwritev.  I assume
>> I need a more recent glibc (on 2.5 now) for qemu to take advantage of
>> this?
> 
> Getting p(read|write)v working almost doubled my virtio-net throughput in a 
> Linux guest. Not quite as much in Windows guests. Yes you need glibc-2.10. I 
> think some distros might have backported it to 2.9. You will also need some 
> support for it in your system includes.

Thanks, I will try a newer glibc, or maybe just move to a newer Linux 
installation which happens to have a newer glibc.

-Andrew
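
For anyone who wants to repeat the binding experiment described above: pinning a qemu vcpu or IO thread to a core is just a sched_setaffinity() call on its pid/tid, roughly what taskset -p -c <cpu> <pid> does. A minimal sketch (pid and cpu are whatever you pick):

/* pin.c: minimal sketch, pins a process or a thread (by tid) to one CPU.
 * Usage: ./pin <pid-or-tid> <cpu> */
#define _GNU_SOURCE
#include <sched.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <pid-or-tid> <cpu>\n", argv[0]);
        return 1;
    }

    pid_t pid = (pid_t)atoi(argv[1]);
    int cpu = atoi(argv[2]);

    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu, &set);

    /* Restrict the given pid/tid to run only on the chosen CPU. */
    if (sched_setaffinity(pid, sizeof(set), &set) < 0) {
        perror("sched_setaffinity");
        return 1;
    }
    return 0;
}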

Iggy Jackson Sept. 4, 2009, 5:04 p.m. UTC | #25
On Friday 04 September 2009 11:08:51 am Andrew Theurer wrote:
> Brian Jackson wrote:
> > On Friday 04 September 2009 09:48:17 am Andrew Theurer wrote:
> > <snip>
> >
> >>> Still not idle=poll, it may shave off 0.2%.
> >>
> >> Won't this affect SMT in a negative way?  (OK, I am not running SMT now,
> >> but eventually we will be) A long time ago, we tested P4's with HT, and
> >> a polling idle in one thread always negatively impacted performance in
> >> the sibling thread.
> >>
> >> FWIW, I did try idle=halt, and it was slightly worse.
> >>
> >> I did get a chance to try the latest qemu (master and next heads).  I
> >> have been running into a problem with virtIO stor driver for windows on
> >> anything much newer than kvm-87.  I compiled the driver from the new git
> >> tree, installed OK, but still had the same error.  Finally, I removed
> >> the serial number feature in the virtio-blk in qemu, and I can now get
> >> the driver to work in Windows.
> >
> > What were the symptoms you were seeing (i.e. define "a problem").
> 
> Device manager reports "a problem code 10" occurred, and the driver
> cannot initialize.


Yes! I was getting this after I moved from 0.10.6 to 0.11.0-rc1. Now I know 
how to fix it. Thank you. Thank you.


> 
> Vadim Rozenfeld informed me:
> > There is a sanity check in the code, which checks the I/O range and fails
> > if it is not equal to 40h. Recent virtio-blk devices have I/O range equal to
> > 0x400 (serial number feature). So, our signed viostor driver will fail
> > on the latest KVMs. This problem was fixed
> 
> and committed to SVN some time ago.
> 
> I assumed the fix was to the virtio windows driver, but I could not get
> the driver I compiled from latest git to work either (only on
> qemu-kvm-87).  So, I just backed out the serial number feature in qemu,
> and it worked.  FWIW, the linux virtio-blk driver never had a problem.


There have been very few changes to the viostor windows git repo since it was 
opened, unless the fix went in before the drivers were open sourced. In any 
case, it doesn't seem to be working with what's publicly available, so I 
suspect something is missing between the internal and external trees.


> 
> >> So, not really any good news on performance with latest qemu builds.
> >> Performance is slightly worse:
> >>
> >> qemu-kvm-87
> >> user  nice  system   irq  softirq guest   idle  iowait
> >> 5.79  0.00    9.28  0.08     1.00 20.81  58.78    4.26
> >> total busy: 36.97
> >>
> >> qemu-kvm-88-905-g6025b2d (master)
> >> user  nice  system   irq  softirq guest   idle  iowait
> >> 6.57  0.00   10.86  0.08     1.02 21.35  55.90    4.21
> >> total busy: 39.89
> >>
> >> qemu-kvm-88-910-gbf8a05b (next)
> >> user  nice  system   irq  softirq guest   idle  iowait
> >> 6.60  0.00  10.91   0.09     1.03 21.35  55.71    4.31
> >> total busy: 39.98
> >>
> >> diff of profiles, p1=qemu-kvm-87, p2=qemu-master
> >
> > <snip>
> >
> >> 18x more samples for gfn_to_memslot_unali*, 37x for
> >> emulator_read_emula*, and more CPU time in guest mode.
> >>
> >> One other thing I decided to try was some cpu binding.  I know this is
> >> not practical for production, but I wanted to see if there's any benefit
> >> at all.  One reason was that a coworker here tried binding the qemu
> >> thread for the vcpu and the qemu IO thread to the same cpu.  On a
> >> networking test, guest->local-host, throughput was up about 2x.
> >> Obviously there was a nice effect of being on the same cache.  I
> >> wondered, even without full bore throughput tests, could we see any
> >> benefit here.  So, I bound each pair of VMs to a dedicated core.  What I
> >> saw was about a 6% improvement in performance.  For a system which has
> >> pretty incredible memory performance and is not that busy, I was
> >> surprised that I got 6%.  I am not advocating binding, but what I do
> >> wonder:  on 1-way VMs, if we keep all the qemu threads together on the
> >> same CPU, but still allowing the scheduler to move them (all of them at
> >> once) to different cpus over time, would we see the same benefit?
> >>
> >> One other thing:  So far I have not been using preadv/pwritev.  I assume
> >> I need a more recent glibc (on 2.5 now) for qemu to take advantage of
> >> this?
> >
> > Getting p(read|write)v working almost doubled my virtio-net throughput in
> > a Linux guest. Not quite as much in Windows guests. Yes you need
> > glibc-2.10. I think some distros might have backported it to 2.9. You
> > will also need some support for it in your system includes.
> 
> Thanks, I will try a newer glibc, or maybe just move to a newer Linux
> installation which happens to have a newer glibc.


Fwiw... In Debian, I had to get glibc from the experimental tree. So some 
distros might not even have it. 


> 
> -Andrew
> 

Vadim Rozenfeld Sept. 5, 2009, 11:34 a.m. UTC | #26
On 09/04/2009 08:04 PM, Brian Jackson wrote:
> On Friday 04 September 2009 11:08:51 am Andrew Theurer wrote:
>    
>> Brian Jackson wrote:
>>      
>>> On Friday 04 September 2009 09:48:17 am Andrew Theurer wrote:
>>> <snip>
>>>
>>>        
>>>>> Still not idle=poll, it may shave off 0.2%.
>>>>>            
>>>> Won't this affect SMT in a negative way?  (OK, I am not running SMT now,
>>>> but eventually we will be) A long time ago, we tested P4's with HT, and
>>>> a polling idle in one thread always negatively impacted performance in
>>>> the sibling thread.
>>>>
>>>> FWIW, I did try idle=halt, and it was slightly worse.
>>>>
>>>> I did get a chance to try the latest qemu (master and next heads).  I
>>>> have been running into a problem with virtIO stor driver for windows on
>>>> anything much newer than kvm-87.  I compiled the driver from the new git
>>>> tree, installed OK, but still had the same error.  Finally, I removed
>>>> the serial number feature in the virtio-blk in qemu, and I can now get
>>>> the driver to work in Windows.
>>>>          
>>> What were the symptoms you were seeing (i.e. define "a problem").
>>>        
>> Device manager reports "a problem code 10" occurred, and the driver
>> cannot initialize.
>>      
>
> Yes! I was getting this after I moved from 0.10.6 to 0.11.0-rc1. Now I know
> how to fix it. Thank you. Thank you.
>
>
>    
>> Vadim Rozenfeld informed me:
>>      
>>> There is a sanity check in the code, which checks the I/O range and fails
>>> if it is not equal to 40h. Recent virtio-blk devices have I/O range equal to
>>> 0x400 (serial number feature). So, our signed viostor driver will fail
>>> on the latest KVMs. This problem was fixed
>>>        
>> and committed to SVN some time ago.
>>
>> I assumed the fix was to the virtio windows driver, but I could not get
>> the driver I compiled from latest git to work either (only on
>> qemu-kvm-87).  So, I just backed out the serial number feature in qemu,
>> and it worked.  FWIW, the linux virtio-blk driver never had a problem.
>>      
>
> There have been very few changes to the viostor windows git repo since it was
> opened, unless the fix went in before the drivers were open sourced. In any
> case, it doesn't seem to be working with what's publicly available, so I
> suspect something is missing between the internal and external trees.
>    
I don't believe it has been merged to git yet.
The signed viostor driver works with qemu-kvm-87 only;
otherwise you need to remove the SN feature from the virtio-blk code, or clip
the I/O range back to the original size.

Cheers,
Vadim.
> <snip>

Avi Kivity Sept. 6, 2009, 8:21 a.m. UTC | #27
On 09/04/2009 05:48 PM, Andrew Theurer wrote:
>
>> Still not idle=poll, it may shave off 0.2%.
>>      
> Won't this affect SMT in a negative way?  (OK, I am not running SMT now,
> but eventually we will be) A long time ago, we tested P4's with HT, and
> a polling idle in one thread always negatively impacted performance in
> the sibling thread.
>    

Sorry, I meant idle=halt.  idle=poll is too wasteful to be used.

> FWIW, I did try idle=halt, and it was slightly worse.
>    

Interesting; I've heard that mwait latency is bad for spinlocks, but I 
guess it's fine for idle.

>    
>> profile1 is qemu-kvm-87
>> profile2 is qemu-master
>> Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (No unit mask) count 10000000
>> total samples (ts1) for profile1 is 1616921
>> total samples (ts2) for profile2 is 1752347 (includes multiplier of 0.995420)
>> functions which have a abs(pct2-pct1)<  0.06 are not displayed
>>
>>                                pct2:   pct1:
>>                                 100*    100*  pct2
>>         s1        s2   s2/s1  s2/ts1  s1/ts1  -pct1 symbol                     bin
>> --------- --------- ------- ------- ------- ------ ------                     ---
>>     879611    907883  1.03/1  56.149  54.400  1.749 vmx_vcpu_run               kvm
>>        614     11553 18.82/1   0.715   0.038  0.677 gfn_to_memslot_unali    kvm.ko
>>      34511     44922  1.30/1   2.778   2.134  0.644 phys_page_find_alloc      qemu
>>       2866      9334  3.26/1   0.577   0.177  0.400 paging64_walk_addr      kvm.ko
>>      11139     17200  1.54/1   1.064   0.689  0.375 copy_user_generic_st   vmlinux
>>       3100      7108  2.29/1   0.440   0.192  0.248 x86_decode_insn         kvm.ko
>>       8169     11873  1.45/1   0.734   0.505  0.229 virtqueue_avail_byte      qemu
>>       1103      4540  4.12/1   0.281   0.068  0.213 kvm_read_guest          kvm.ko
>>      17427     20401  1.17/1   1.262   1.078  0.184 memcpy                    libc
>>          0      2905           0.180   0.000  0.180 gfn_to_pfn              kvm.ko
>>       1831      4328  2.36/1   0.268   0.113  0.154 x86_emulate_insn        kvm.ko
>>         65      2431 37.41/1   0.150   0.004  0.146 emulator_read_emulat    kvm.ko
>>      14922     17196  1.15/1   1.064   0.923  0.141 qemu_get_ram_ptr          qemu
>>        545      2724  5.00/1   0.168   0.034  0.135 emulate_instruction     kvm.ko
>>        599      2464  4.11/1   0.152   0.037  0.115 kvm_read_guest_page     kvm.ko
>>        503      2355  4.68/1   0.146   0.031  0.115 gfn_to_hva              kvm.ko
>>       1076      2918  2.71/1   0.181   0.067  0.114 memcpy_c               vmlinux
>>        594      2241  3.77/1   0.139   0.037  0.102 next_segment            kvm.ko
>>       1680      3248  1.93/1   0.201   0.104  0.097 pipe_poll              vmlinux
>>          0      1463           0.090   0.000  0.090 subpage_readl             qemu
>>          0      1363           0.084   0.000  0.084 msix_enabled              qemu
>>        527      1883  3.57/1   0.116   0.033  0.084 paging64_gpte_to_gfn    kvm.ko
>>        962      2223  2.31/1   0.138   0.059  0.078 do_insn_fetch           kvm.ko
>>        348      1605  4.61/1   0.099   0.022  0.078 is_rsvd_bits_set        kvm.ko
>>        520      1763  3.39/1   0.109   0.032  0.077 unalias_gfn             kvm.ko
>>          1      1163 1163.65/1   0.072   0.000  0.072 tdp_page_fault          kvm.ko
>>       3827      4912  1.28/1   0.304   0.237  0.067 __down_read            vmlinux
>>          0      1014           0.063   0.000  0.063 mapping_level           kvm.ko
>>        973         0           0.000   0.060 -0.060 pm_ioport_readl           qemu
>>       1635       528  1/3.09   0.033   0.101 -0.068 ioport_read               qemu
>>       2179      1017  1/2.14   0.063   0.135 -0.072 kvm_emulate_pio         kvm.ko
>>      25141     23722  1/1.06   1.467   1.555 -0.088 native_write_msr_saf   vmlinux
>>       1560         0           0.000   0.096 -0.096 eventfd_poll           vmlinux
>>                              ------- ------- ------
>>                              105.100  97.450  7.650
>>      
>
> 18x more samples for gfn_to_memslot_unali*, 37x for
> emulator_read_emula*, and more CPU time in guest mode.
>    

And 5x more instructions emulated.  I wonder where that comes from.

> One other thing:  So far I have not been using preadv/pwritev.  I assume
> I need a more recent glibc (on 2.5 now) for qemu to take advantage of
> this?
>
>    

Yes, but it should be easy to write an LD_PRELOAD hack that will work 
with your current glibc.  It should certainly improve things.
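
Something like the following is what I'd expect such a hack to look like; a rough sketch for x86_64 only (the syscall numbers 295/296 and the lo/hi offset split are assumptions to verify against your kernel headers), and qemu still has to be built with its configure believing preadv exists:

/* preadv-shim.c: LD_PRELOAD sketch providing preadv/pwritev on an old
 * glibc by invoking the syscalls directly.  x86_64-only sketch; check
 * the syscall numbers against your kernel headers.
 * Build: gcc -shared -fPIC -o preadv-shim.so preadv-shim.c
 * Run:   LD_PRELOAD=./preadv-shim.so qemu-system-x86_64 ... */
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_preadv
#define __NR_preadv  295   /* x86_64 */
#endif
#ifndef __NR_pwritev
#define __NR_pwritev 296   /* x86_64 */
#endif

ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
    /* The kernel syscall takes the offset as a lo/hi pair. */
    return syscall(__NR_preadv, fd, iov, iovcnt,
                   (unsigned long)offset, (unsigned long)(offset >> 32));
}

ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
    return syscall(__NR_pwritev, fd, iov, iovcnt,
                   (unsigned long)offset, (unsigned long)(offset >> 32));
}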
Yan Vugenfirer Sept. 6, 2009, 12:49 p.m. UTC | #28
> -----Original Message-----
> From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On
> Behalf Of Vadim Rozenfeld
> Sent: Saturday, September 05, 2009 2:35 PM
> To: Brian Jackson
> Cc: Andrew Theurer; kvm@vger.kernel.org
> Subject: Re: [PATCH] KVM: Use thread debug register storage instead of
> kvm specific data
> 
> On 09/04/2009 08:04 PM, Brian Jackson wrote:
> > On Friday 04 September 2009 11:08:51 am Andrew Theurer wrote:
> >
> >> Brian Jackson wrote:
> >>
> >>> On Friday 04 September 2009 09:48:17 am Andrew Theurer wrote:
> >>> <snip>
> >>>
> >>>
> >>>>> Still not idle=poll, it may shave off 0.2%.
> >>>>>
> >>>> Won't this affect SMT in a negative way?  (OK, I am not running
> SMT now,
> >>>> but eventually we will be) A long time ago, we tested P4's with
> HT, and
> >>>> a polling idle in one thread always negatively impacted
> performance in
> >>>> the sibling thread.
> >>>>
> >>>> FWIW, I did try idle=halt, and it was slightly worse.
> >>>>
> >>>> I did get a chance to try the latest qemu (master and next heads).
> I
> >>>> have been running into a problem with virtIO stor driver for
> windows on
> >>>> anything much newer than kvm-87.  I compiled the driver from the
> new git
> >>>> tree, installed OK, but still had the same error.  Finally, I
> removed
> >>>> the serial number feature in the virtio-blk in qemu, and I can now
> get
> >>>> the driver to work in Windows.
> >>>>
> >>> What were the symptoms you were seeing (i.e. define "a problem").
> >>>
> >> Device manager reports "a problem code 10" occurred, and the driver
> >> cannot initialize.
> >>
> >
> > Yes! I was getting this after I moved from 0.10.6 to 0.11.0-rc1. Now
> I know
> > how to fix it. Thank you. Thank you.
> >
> >
> >
> >> Vadim Rozenfeld informed me:
> >>
> >>> There is a sanity check in the code, which checks the I/O range and
> fails
> >>> if is not equal to 40h. Resent virtio-blk devices have I/O range
> equal to
> >>> 0x400 (serial number feature). So, out signed  viostor driver will
> fail
> >>> on the latest KVMs. This problem was fixed
> >>>
> >> and committed to SVN some time ago.
> >>
> >> I assumed the fix was to the virtio windows driver, but I could not
> get
> >> the driver I compiled from latest git to work either (only on
> >> qemu-kvm-87).  So, I just backed out the serial number feature in
> qemu,
> >> and it worked.  FWIW, the linux virtio-blk driver never had a
> problem.
> >>
> >
> > There have been very few changes to the viostor windows git repo
> since it was
> > opened. Unless it was done before they were open sourced. In any
> case, it
> > doesn't seem to be working with what's publicly available, so I think
> maybe
> > there is something missing internal to external.
> >
> I don't believe it has been merged to git yet.
> The signed viostor driver works with qemu-kvm-87 only;
> otherwise you need to remove the SN feature from the virtio-blk code, or clip
> the I/O range back to the original size.
[YV] Please find attached a patch with the fix. Kernel.org is experiencing some
problems now; the moment they are fixed, I will push this patch to it.
> 
> Cheers,
> Vadim.
> <snip>
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6046e6f..45226f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,9 +362,6 @@  struct kvm_vcpu_arch {
 	u32 pat;
 
 	int switch_db_regs;
-	unsigned long host_db[KVM_NR_DB_REGS];
-	unsigned long host_dr6;
-	unsigned long host_dr7;
 	unsigned long db[KVM_NR_DB_REGS];
 	unsigned long dr6;
 	unsigned long dr7;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 891234b..9e3acbd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3627,14 +3627,7 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	kvm_guest_enter();
 
-	get_debugreg(vcpu->arch.host_dr6, 6);
-	get_debugreg(vcpu->arch.host_dr7, 7);
 	if (unlikely(vcpu->arch.switch_db_regs)) {
-		get_debugreg(vcpu->arch.host_db[0], 0);
-		get_debugreg(vcpu->arch.host_db[1], 1);
-		get_debugreg(vcpu->arch.host_db[2], 2);
-		get_debugreg(vcpu->arch.host_db[3], 3);
-
 		set_debugreg(0, 7);
 		set_debugreg(vcpu->arch.eff_db[0], 0);
 		set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -3645,15 +3638,14 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	trace_kvm_entry(vcpu->vcpu_id);
 	kvm_x86_ops->run(vcpu);
 
-	if (unlikely(vcpu->arch.switch_db_regs)) {
-		set_debugreg(0, 7);
-		set_debugreg(vcpu->arch.host_db[0], 0);
-		set_debugreg(vcpu->arch.host_db[1], 1);
-		set_debugreg(vcpu->arch.host_db[2], 2);
-		set_debugreg(vcpu->arch.host_db[3], 3);
+	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
+		set_debugreg(current->thread.debugreg0, 0);
+		set_debugreg(current->thread.debugreg1, 1);
+		set_debugreg(current->thread.debugreg2, 2);
+		set_debugreg(current->thread.debugreg3, 3);
+		set_debugreg(current->thread.debugreg6, 6);
+		set_debugreg(current->thread.debugreg7, 7);
 	}
-	set_debugreg(vcpu->arch.host_dr6, 6);
-	set_debugreg(vcpu->arch.host_dr7, 7);
 
 	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();