diff mbox

KVM: Improvements for task switching

Message ID loom.20090312T170331-256@post.gmane.org (mailing list archive)
State New, archived
Headers show

Commit Message

Bernhard Kohl March 12, 2009, 5:06 p.m. UTC
NSN's proprietary OS DMX sometimes does task switches.
To get it running in KVM the following changes were necessary:
Interrupt injection only with interrupt flag set.
Linking the tss->prev_task_link to itself removed.
Task linking is required for CALL and GATE.
Do not call skip_emulated_instruction() for GATE.

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
---
 arch/x86/kvm/vmx.c |    3 ++-
 arch/x86/kvm/x86.c |   19 +++++++++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

 
@@ -3882,10 +3895,12 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
 	}
 
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	if (reason != TASK_SWITCH_GATE)
+		kvm_x86_ops->skip_emulated_instruction(vcpu);
 
 	if (nseg_desc.type & 8)
 		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
+ 					 old_tss_sel, reason,
 					 &nseg_desc);
 	else
 		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,

Comments

Jan Kiszka March 12, 2009, 6:43 p.m. UTC | #1
Bernhard Kohl wrote:
> NSN's proprietary OS DMX sometimes does task switches.
> To get it running in KVM the following changes were necessary:
> Interrupt injection only with interrupt flag set.
> Linking the tss->prev_task_link to itself removed.
> Task linking is required for CALL and GATE.
> Do not call skip_emulated_instruction() for GATE.

Please post independent changes as separate patches. I guess the task
linking changes belong together, but surely not to the IRQ injection
patch. And the last change looks independent, too.

Another wish (specifically as this is tricky stuff): also describe in
the commit log, why you changed something.

> 
> Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
> ---
>  arch/x86/kvm/vmx.c |    3 ++-
>  arch/x86/kvm/x86.c |   19 +++++++++++++++++--
>  2 files changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 5cf28df..eca57a3 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3357,7 +3357,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
>  			enable_irq_window(vcpu);
>  	}
>  	if (vcpu->arch.interrupt.pending) {
> -		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
> +		if (vcpu->arch.interrupt_window_open)
> +			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
>  		if (kvm_cpu_has_interrupt(vcpu))
>  			enable_irq_window(vcpu);
>  	}

That causes concerns on my side as we had a hard time stabilizing this
code. Need to think about it. Do you happen to have a test case for this
(if it's not publicly shareable, contact me directly)? Did you check
that this change causes no obvious regressions to other guests? What
about the user-inject IRQ case, does it already work for you as-is?

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b556b6a..9052058 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3683,7 +3683,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
>  	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
>  	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
>  	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
> -	tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
> +	tss->prev_task_link = 0;
>  }
>  
>  static int load_state_from_tss32(struct kvm_vcpu *vcpu,
> @@ -3810,6 +3810,7 @@ out:
>  
>  static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
>  		       u32 old_tss_base,
> +		       u16 old_tss_selector, int reason,
>  		       struct desc_struct *nseg_desc)
>  {
>  	struct tss_segment_32 tss_segment_32;

What about 16-bit switches, are they already correct?

> @@ -3829,6 +3830,18 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16
> tss_selector,
>  			   &tss_segment_32, sizeof tss_segment_32))
>  		goto out;
>  
> +	/*
> +	 * SDM 3: table 6-2
> +	 * Task linking required for CALL and GATE.
> +	 */
> +	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
> +	{
> +		tss_segment_32.prev_task_link = old_tss_selector;
> +		kvm_write_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
> +				&tss_segment_32, sizeof(struct tss_segment_32));
> +
> +	}
> +
>  	if (load_state_from_tss32(vcpu, &tss_segment_32))
>  		goto out;
>  
> @@ -3882,10 +3895,12 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16
> tss_selector, int reason)
>  		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
>  	}
>  
> -	kvm_x86_ops->skip_emulated_instruction(vcpu);
> +	if (reason != TASK_SWITCH_GATE)
> +		kvm_x86_ops->skip_emulated_instruction(vcpu);
>  
>  	if (nseg_desc.type & 8)
>  		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
> + 					 old_tss_sel, reason,
>  					 &nseg_desc);
>  	else
>  		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,

Jan
Jan Kiszka March 12, 2009, 7:12 p.m. UTC | #2
Jan Kiszka wrote:
> Bernhard Kohl wrote:
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 5cf28df..eca57a3 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -3357,7 +3357,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
>>  			enable_irq_window(vcpu);
>>  	}
>>  	if (vcpu->arch.interrupt.pending) {
>> -		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
>> +		if (vcpu->arch.interrupt_window_open)
>> +			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
>>  		if (kvm_cpu_has_interrupt(vcpu))
>>  			enable_irq_window(vcpu);
>>  	}
> 
> That causes concerns on my side as we had a hard time stabilizing this
> code. Need to think about it. Do you happen to have a test case for this
> (if it's not publicly shareable, contact me directly)? Did you check
> that this change causes no obvious regressions to other guests? What
> about the user-inject IRQ case, does it already work for you as-is?

Hmm, do_interrupt_requests will most likely not cause troubles as it
both pends and injects interrupts only when the window is open. I don't
get the scenario behind this here yet, but I think it would be a very
good chance to align the code layout of vmx_intr_assist to
do_interrupt_requests in this respect, either finally de-optimizing or
even breaking both :) - or bringing them in the same correct form.

Jan
Bernhard Kohl March 13, 2009, 2:17 p.m. UTC | #3
Jan Kiszka <jan.kiszka <at> siemens.com> writes:

> 
> Bernhard Kohl wrote:
> > NSN's proprietary OS DMX sometimes does task switches.
> > To get it running in KVM the following changes were necessary:
> > Interrupt injection only with interrupt flag set.
> > Linking the tss->prev_task_link to itself removed.
> > Task linking is required for CALL and GATE.
> > Do not call skip_emulated_instruction() for GATE.
> 
> Please post independent changes as separate patches. I guess the task
> linking changes belong together, but surely not to the IRQ injection
> patch. And the last change looks independent, too.

From my point of view it is one patch. The DMX OS crashed during its task
switch. After fixing the first problem we got the 2nd, then the 3rd and 4th.
It can only finish a complete task switch with all of these fixed. Obviously
no other guests do this kind of task switch.

> 
> Another wish (specifically as this is tricky stuff): also describe in
> the commit log, why you changed something.

OK, I will do that.

> 
> That causes concerns on my side as we had a hard time stabilizing this
> code. Need to think about it. Do you happen to have a test case for this
> (if it's not publicly shareable, contact me directly)? Did you check
> that this change causes no obvious regressions to other guests? What
> about the user-inject IRQ case, does it already work for you as-is?

The test case is our DMX OS (no public availability). Without these changes it
crashes the VM. No known other problems. Linux guests run well with these
changes. Others not tested.

> 
> What about 16-bit switches, are they already correct?

Maybe similar changes are needed for 16-bit switches. DMX does not do that.
So I have no guest to test this.

Bernhard


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 13, 2009, 3:17 p.m. UTC | #4
Bernhard Kohl wrote:
> Jan Kiszka <jan.kiszka <at> siemens.com> writes:
> 
>> Bernhard Kohl wrote:
>>> NSN's proprietary OS DMX sometimes does task switches.
>>> To get it running in KVM the following changes were necessary:
>>> Interrupt injection only with interrupt flag set.
>>> Linking the tss->prev_task_link to itself removed.
>>> Task linking is required for CALL and GATE.
>>> Do not call skip_emulated_instruction() for GATE.
>> Please post independent changes as separate patches. I guess the task
>> linking changes belong together, but surely not to the IRQ injection
>> patch. And the last change looks independent, too.
> 
> From my point of view it is one patch. The DMX OS crashed during its task
> switch. After fixing the first problem we got the 2nd, then the 3rd and 4th.
> It can only complete a complete task switch with all this fixed. Obviously
> all other guests don't do this kind of task switches.

Let's consider some hypothetical guest that gets unhappy about the 4th
change but would be fine with the other three - in order to find the
origin of the regression more quickly, one needs separate patches that
can be reverted and re-applied one-by-one. Look at this from a higher
POV, not just from your guest's perspective.

> 
>> Another wish (specifically as this is tricky stuff): also describe in
>> the commit log, why you changed something.
> 
> OK, I will do that.
> 
>> That causes concerns on my side as we had a hard time stabilizing this
>> code. Need to think about it. Do you happen to have a test case for this
>> (if it's not publicly shareable, contact me directly)? Did you check
>> that this change causes no obvious regressions to other guests? What
>> about the user-inject IRQ case, does it already work for you as-is?
> 
> The test case is our DMX OS (no public availability). Without these changes it
> crashes the VM.

How did you debug the irq injection bug? Can you explain the scenario
which finally leads to your guest crash?

Normally, some to-be-injected IRQ is marked pending first when the IRQ
window is open and it is then immediately injected. That may fail, the
failure resolution is started, and then the still pending IRQ is
re-injected. I'm interested in that failure, and why the IRQ window
state changed after fixing up. Maybe it is a specific property of your
OS. See, I'm a fan of understanding what went wrong before patching it. :)

> No known other problems. Linux guests run well with these
> changes. Others not tested.

Meanwhile I also think that this particular change should not cause
regressions.

> 
>> What about 16-bit switches, are they already correct?
> 
> Maybe similar changes are needed for 16-bit switches. DMX does not do that.
> So I have no guest to test this.

At least you could try to apply your findings in an analogous way to the
16-bit case. Note in the change log that there is no test case yet and
let us wait for someone else to come around and stress it (which
probably means that we had no user for that use case so far anyway).

Jan
Bernhard Kohl March 13, 2009, 3:55 p.m. UTC | #5
Jan Kiszka Wrote:
> 
> Bernhard Kohl wrote:
> > Jan Kiszka <jan.kiszka <at> siemens.com> writes:
> > 
> >> Bernhard Kohl wrote:
> >>> NSN's proprietary OS DMX sometimes does task switches.
> >>> To get it running in KVM the following changes were necessary:
> >>> Interrupt injection only with interrupt flag set.
> >>> Linking the tss->prev_task_link to itself removed.
> >>> Task linking is required for CALL and GATE.
> >>> Do not call skip_emulated_instruction() for GATE.
> >> Please post independent changes as separate patches. I 
> guess the task
> >> linking changes belong together, but surely not to the IRQ 
> injection
> >> patch. And the last change looks independent, too.
> > 
> > From my point of view it is one patch. The DMX OS crashed 
> during its task
> > switch. After fixing the first problem we got the 2nd, then 
> the 3rd and 4th.
> > It can only complete a complete task switch with all this 
> fixed. Obviously
> > all other guests don't do this kind of task switches.
> 
> Let's consider some hypothetic guest that gets unhappy about the 4th
> change but would be fine with the other three - in order to find the
> origin of the regression more quickly, one needs separate patches that
> can be reverted and re-applied one-by-one. Look at this from a higher
> POV, not just from your guest's perspective.

OK, after the discussion has finished, I will submit separate patches.

> 
> > 
> >> Another wish (specifically as this is tricky stuff): also 
> describe in
> >> the commit log, why you changed something.
> > 
> > OK, I will do that.
> > 
> >> That causes concerns on my side as we had a hard time 
> stabilizing this
> >> code. Need to think about it. Do you happen to have a test 
> case for this
> >> (if it's not publicly shareable, contact me directly)? Did 
> you check
> >> that this change causes no obvious regressions to other 
> guests? What
> >> about the user-inject IRQ case, does it already work for you as-is?
> > 
> > The test case is our DMX OS (no public availability). 
> Without these changes it
> > crashes the VM.
> 
> How did you debug the irq injection bug? Can you explain the scenario
> which finally leads to your guest crash?

Actually my colleague Thomas did the debugging. Thomas, please describe
the details!

> 
> Normally, some to-be-injected IRQ is marked pending first when the IRQ
> window is open and it is then immediately injected. That may fail, the
> failure resolution is started, and then the still pending IRQ is
> re-injected. I'm interested in that failure, and why the IRQ window
> state changed after fixing up. Maybe it is a specific property of your
> OS. See, I'm a fan of understanding what went wrong before 
> patching it. :)
> 
> > No known other problems. Linux guests run well with these
> > changes. Others not tested.
> 
> Meanwhile I also think that this particular change should not cause
> regressions.
> 
> > 
> >> What about 16-bit switches, are they already correct?
> > 
> > Maybe similar changes are needed for 16-bit switches. DMX 
> does not do that.
> > So I have no guest to test this.
> 
> At least you could try to apply your findings in an analogous 
> way to the
> 16-bit case. Note in the change log that there is no test case yet and
> let us wait for someone else to come around and stress it (which
> probably means that we had no user for that use case so far anyway).

Thomas, can you do that? I'm on vacation next week. After that I will
post the final result as a new patch set.

Bernhard
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka March 13, 2009, 3:58 p.m. UTC | #6
Kohl, Bernhard (NSN - DE/Munich) wrote:
> Jan Kiszka Wrote:
>> Bernhard Kohl wrote:
>>> Jan Kiszka <jan.kiszka <at> siemens.com> writes:
>>>
>>>> Bernhard Kohl wrote:
>>>>> NSN's proprietary OS DMX sometimes does task switches.
>>>>> To get it running in KVM the following changes were necessary:
>>>>> Interrupt injection only with interrupt flag set.
>>>>> Linking the tss->prev_task_link to itself removed.
>>>>> Task linking is required for CALL and GATE.
>>>>> Do not call skip_emulated_instruction() for GATE.
>>>> Please post independent changes as separate patches. I 
>> guess the task
>>>> linking changes belong together, but surely not to the IRQ 
>> injection
>>>> patch. And the last change looks independent, too.
>>> From my point of view it is one patch. The DMX OS crashed 
>> during its task
>>> switch. After fixing the first problem we got the 2nd, then 
>> the 3rd and 4th.
>>> It can only complete a complete task switch with all this 
>> fixed. Obviously
>>> all other guests don't do this kind of task switches.
>> Let's consider some hypothetic guest that gets unhappy about the 4th
>> change but would be fine with the other three - in order to find the
>> origin of the regression more quickly, one needs separate patches that
>> can be reverted and re-applied one-by-one. Look at this from a higher
>> POV, not just from your guest's perspective.
> 
> OK, after the discussion has finished, I will submit separate patches.
> 
>>>> Another wish (specifically as this is tricky stuff): also 
>> describe in
>>>> the commit log, why you changed something.
>>> OK, I will do that.
>>>
>>>> That causes concerns on my side as we had a hard time 
>> stabilizing this
>>>> code. Need to think about it. Do you happen to have a test 
>> case for this
>>>> (if it's not publicly shareable, contact me directly)? Did 
>> you check
>>>> that this change causes no obvious regressions to other 
>> guests? What
>>>> about the user-inject IRQ case, does it already work for you as-is?
>>> The test case is our DMX OS (no public availability). 
>> Without these changes it
>>> crashes the VM.
>> How did you debug the irq injection bug? Can you explain the scenario
>> which finally leads to your guest crash?
> 
> Actually my colleague Thomas did the debugging. Thomas, please describe
> the details!
> 
>> Normally, some to-be-injected IRQ is marked pending first when the IRQ
>> window is open and it is then immediately injected. That may fail, the
>> failure resolution is started, and then the still pending IRQ is
>> re-injected. I'm interested in that failure, and why the IRQ window
>> state changed after fixing up. Maybe it is a specific property of your
>> OS. See, I'm a fan of understanding what went wrong before 
>> patching it. :)
>>
>>> No known other problems. Linux guests run well with these
>>> changes. Others not tested.
>> Meanwhile I also think that this particular change should not cause
>> regressions.
>>
>>>> What about 16-bit switches, are they already correct?
>>> Maybe similar changes are needed for 16-bit switches. DMX 
>> does not do that.
>>> So I have no guest to test this.
>> At least you could try to apply your findings in an analogous 
>> way to the
>> 16-bit case. Note in the change log that there is no test case yet and
>> let us wait for someone else to come around and stress it (which
>> probably means that we had no user for that use case so far anyway).
> 
> Thomas, can you do that? I'm on vacation next week. After that I will
> post the final result as a new patch set.
> 

Great, thanks in advance!

Jan
Andi Kleen March 18, 2009, 12:38 a.m. UTC | #7
Jan Kiszka <jan.kiszka@siemens.com> writes:
>
> At least you could try to apply your findings in an analogous way to the
> 16-bit case. Note in the change log that there is no test case yet and
> let us wait for someone else to come around and stress it (which
> probably means that we had no user for that use case so far anyway).

AFAIK the standard test case for really obscure x86 features like this is
OS/2. It would be good to check with that, especially older versions.

-Andi
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Julian Stecklina March 23, 2009, 6:15 p.m. UTC | #8
"Kohl, Bernhard (NSN - DE/Munich)" <bernhard.kohl@nsn.com> writes:

> Jan Kiszka Wrote:
[...]
> OK, after the discussion has finished, I will submit separate patches.

Is there any progress on this? I've been using this patch for several
days now with no ill effects.

The patch fixes Bug 2681442 for me:
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2681442&group_id=180599

Regards,
Julian Stecklina April 12, 2009, 5:31 p.m. UTC | #9
Julian Stecklina <js@alien8.de> writes:

> "Kohl, Bernhard (NSN - DE/Munich)" <bernhard.kohl@nsn.com> writes:
>
>> Jan Kiszka Wrote:
> [...]
>> OK, after the discussion has finished, I will submit separate patches.
>
> Is there any progress on this? I've been using this patch for several
> days now with no ill effects.
>
> The patch fixes Bug 2681442 for me:
> https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2681442&group_id=180599

The bug is fixed in the upcoming KVM 85.

Regards,
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5cf28df..eca57a3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3357,7 +3357,8 @@  static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			enable_irq_window(vcpu);
 	}
 	if (vcpu->arch.interrupt.pending) {
-		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+		if (vcpu->arch.interrupt_window_open)
+			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
 		if (kvm_cpu_has_interrupt(vcpu))
 			enable_irq_window(vcpu);
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b556b6a..9052058 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3683,7 +3683,7 @@  static void save_state_to_tss32(struct kvm_vcpu *vcpu,
 	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
 	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
 	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-	tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
+	tss->prev_task_link = 0;
 }
 
 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3810,6 +3810,7 @@  out:
 
 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
 		       u32 old_tss_base,
+		       u16 old_tss_selector, int reason,
 		       struct desc_struct *nseg_desc)
 {
 	struct tss_segment_32 tss_segment_32;
@@ -3829,6 +3830,18 @@  static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
 			   &tss_segment_32, sizeof tss_segment_32))
 		goto out;
 
+	/*
+	 * SDM 3: table 6-2
+	 * Task linking required for CALL and GATE.
+	 */
+	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+	{
+		tss_segment_32.prev_task_link = old_tss_selector;
+		kvm_write_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+				&tss_segment_32, sizeof(struct tss_segment_32));
+
+	}
+
 	if (load_state_from_tss32(vcpu, &tss_segment_32))
 		goto out;