
deal with interrupt shadow state for emulated instruction

Message ID 1239653210-10422-1-git-send-email-glommer@redhat.com (mailing list archive)
State New, archived

Commit Message

Glauber Costa April 13, 2009, 8:06 p.m. UTC
We currently unblock the interrupt shadow state when we skip an instruction,
but fail to do so when we actually emulate one. This leaves interrupts blocked
across key instruction sequences, in particular sti; hlt sequences.

If the emulated instruction is an sti, we have to set the interrupt shadow.
The same goes for mov ss; pop ss needs it too, but we don't currently
emulate it. For sequences of two or more consecutive instructions of these
types, only the first one has this effect.

Without this patch, I cannot boot gPXE option ROMs on VMX machines.
This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h        |    1 +
 arch/x86/include/asm/kvm_x86_emulate.h |    7 +++++
 arch/x86/kvm/svm.c                     |   19 +++++++++++++-
 arch/x86/kvm/vmx.c                     |   42 ++++++++++++++++++++++++-------
 arch/x86/kvm/x86.c                     |    6 ++++-
 arch/x86/kvm/x86_emulate.c             |   14 ++++++++++
 6 files changed, 76 insertions(+), 13 deletions(-)
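
For context: sti and mov ss inhibit interrupt delivery until after the next
instruction completes. The rule the patch encodes can be modeled in
standalone C (an illustrative sketch of the architectural behavior, not KVM
code; the tiny instruction set and two-field state are simplifications):

/*
 * Standalone model of the one-instruction interrupt shadow
 * (illustrative only -- not KVM code).
 */
#include <stdbool.h>
#include <stdio.h>

enum insn { INSN_STI, INSN_MOV_SS, INSN_HLT, INSN_NOP };

struct cpu_model {
	bool if_flag;	/* EFLAGS.IF */
	bool shadow;	/* interrupts inhibited for one more instruction */
};

/* Execute one instruction; return whether an interrupt may now be taken. */
static bool step(struct cpu_model *c, enum insn i)
{
	bool was_shadowed = c->shadow;

	c->shadow = false;
	switch (i) {
	case INSN_STI:
		/* sti casts a shadow only when it actually sets IF. */
		if (!c->if_flag)
			c->shadow = true;
		c->if_flag = true;
		break;
	case INSN_MOV_SS:
		/* Per the SDM's "may", a repeated mov ss need not renew the
		 * shadow; modeled here as only the first one counting. */
		if (!was_shadowed)
			c->shadow = true;
		break;
	default:
		break;
	}
	return c->if_flag && !c->shadow;
}

int main(void)
{
	struct cpu_model c = { .if_flag = false, .shadow = false };

	/* sti; hlt -- no interrupt window between the two instructions,
	 * so the wakeup interrupt arrives while halted, as intended. */
	printf("after sti: window %s\n", step(&c, INSN_STI) ? "open" : "closed");
	printf("after hlt: window %s\n", step(&c, INSN_HLT) ? "open" : "closed");
	return 0;
}

The patch makes the emulated path both clear a stale shadow and set a fresh
one after sti or mov ss, mirroring what skip_emulated_instruction already
did for the clearing half.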

Comments

Gleb Natapov April 14, 2009, 9:07 a.m. UTC | #1
On Mon, Apr 13, 2009 at 04:06:50PM -0400, Glauber Costa wrote:
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3fc4623..0db1be7 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -513,6 +513,7 @@ struct kvm_x86_ops {
>  	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
>  	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
>  	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
> +	void (*interrupt_shadow_mask)(struct kvm_vcpu *vcpu, int mask);

Note that this conflicts with my "Move interrupt injection logic to x86.c"
patch. It adds a drop_interrupt_shadow callback. Your callback is more
general, so it should replace mine when/if my patchset goes in.

--
			Gleb.
Avi Kivity April 14, 2009, 9:34 a.m. UTC | #2
Glauber Costa wrote:
> We currently unblock the interrupt shadow state when we skip an instruction,
> but fail to do so when we actually emulate one. This leaves interrupts blocked
> across key instruction sequences, in particular sti; hlt sequences.
>
> If the emulated instruction is an sti, we have to set the interrupt shadow.
> The same goes for mov ss; pop ss needs it too, but we don't currently
> emulate it. For sequences of two or more consecutive instructions of these
> types, only the first one has this effect.
>
> Without this patch, I cannot boot gPXE option ROMs on VMX machines.
> This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469
>
>   

We'll defer this until after Gleb's patchset, since that's much bigger.

> +#define X86_SHADOW_INT_MOV_SS	1
> +#define X86_SHADOW_INT_STI	2
> +
>  struct x86_emulate_ctxt {
>  	/* Register state before/after emulation. */
>  	struct kvm_vcpu *vcpu;
> @@ -152,6 +155,10 @@ struct x86_emulate_ctxt {
>  	int mode;
>  	u32 cs_base;
>  
> +	/* interruptibility state, as a result of execution of STI or MOV SS */
> +	int interruptibility;
> +	int movss_int_flag, movss_int_flag_old;
> +
>   

Bit masks are traditionally unsigned.
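
That is, the new declarations would presumably become (a sketch of the
suggested change, not a posted revision):

	u32 interruptibility;
	void (*interrupt_shadow_mask)(struct kvm_vcpu *vcpu, u32 mask);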

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0bb4131..b1fc8b6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2364,7 +2364,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
>  			u16 error_code,
>  			int emulation_type)
>  {
> -	int r;
> +	int r, shadow_mask;
>  	struct decode_cache *c;
>  
>  	kvm_clear_exception_queue(vcpu);
> @@ -2412,8 +2412,12 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
>  		}
>  	}
>  
> +	vcpu->arch.emulate_ctxt.interruptibility = 0;
>  	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
>  
> +	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
> +	kvm_x86_ops->interrupt_shadow_mask(vcpu, shadow_mask);
> +
>   

Emulation may have failed, in which case you don't want to update the 
interrupt shadow mask.
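
One possible shape of that guard, assuming the rc == 0 success convention
visible in x86_emulate_insn (a sketch, not the posted patch):

	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);

	/* Only commit the emulator's shadow state on success. */
	if (r == 0) {
		shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
		kvm_x86_ops->interrupt_shadow_mask(vcpu, shadow_mask);
	}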

>  	if (vcpu->arch.pio.string)
>  		return EMULATE_DO_MMIO;
>  
> diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
> index d7c9f6f..1369a2e 100644
> --- a/arch/x86/kvm/x86_emulate.c
> +++ b/arch/x86/kvm/x86_emulate.c
> @@ -1360,6 +1360,10 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
>  	int io_dir_in;
>  	int rc = 0;
>  
> +	ctxt->movss_int_flag_old = ctxt->movss_int_flag;
> +
> +	ctxt->movss_int_flag = 0;
>   

This seems to be internal to the emulator.  However, instructions may be 
executed outside the emulator, invalidating movss_int_flag.  But see below.

> @@ -1610,6 +1614,14 @@ special_insn:
>  
>  		sel = c->src.val;
>  		if (c->modrm_reg <= 5) {
> +			if (c->modrm_reg == VCPU_SREG_SS) {
> +				if (ctxt->movss_int_flag_old)
> +					ctxt->interruptibility |=
> +						X86_SHADOW_INT_MOV_SS;
> +				else
> +					ctxt->movss_int_flag = 1;
> +			}
>   

The comment about repeating 'mov ss' in the manual has that wonderful 
word in it, May.  That means we're perfectly allowed to ignore it and 
just set the flag unconditionally.

I doubt we'll ever see a repeated 'mov ss', once is more than enough.
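
Concretely, taking the SDM's "may" at face value would shrink the mov ss
handling to one unconditional set and drop the movss_int_flag bookkeeping
entirely (a sketch of the suggestion, not the posted patch):

			if (c->modrm_reg == VCPU_SREG_SS)
				ctxt->interruptibility |= X86_SHADOW_INT_MOV_SS;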
H. Peter Anvin April 14, 2009, 4:07 p.m. UTC | #3
Avi Kivity wrote:
> 
> The comment about repeating 'mov ss' in the manual has that wonderful
> word in it, May.  That means we're perfectly allowed to ignore it and
> just set the flag unconditionally.
> 

Realistically, though, this should only be done for a limited number of
sequential instructions.

> I doubt we'll ever see a repeated 'mov ss', once is more than enough.

True enough, except maliciously.

	-hpa
Avi Kivity April 14, 2009, 4:14 p.m. UTC | #4
H. Peter Anvin wrote:
> Avi Kivity wrote:
>   
>> The comment about repeating 'mov ss' in the manual has that wonderful
>> word in it, May.  That means we're perfectly allowed to ignore it and
>> just set the flag unconditionally.
>>
>>     
>
> Realistically, though, this should only be done for a limited number of
> sequential instructions.
>
>   

Why?  Do you see a guest filling all of memory with 'mov ss' and 
expecting to break out of it via an interrupt?

>> I doubt we'll ever see a repeated 'mov ss', once is more than enough.
>>     
>
> True enough, except maliciously.
>   

Why do we care?  The guest can only harm itself, and if it wants to 
disable interrupts, it would be a lot easier for it to run a plain 'cli'.

I guess it would be a problem if we emulated 'mov ss' for ordinary 
userspace or vm86 mode, but we don't.
H. Peter Anvin April 14, 2009, 4:25 p.m. UTC | #5
Avi Kivity wrote:
> 
> Why do we care?  The guest can only harm itself, and if it wants to
> disable interrupts, it would be a lot easier for it to run a plain 'cli'.
> 
> I guess it would be a problem if we emulated 'mov ss' for ordinary
> userspace or vm86 mode, but we don't.
> 

Well, the answer is that mov ss is an unprivileged instruction.

	-hpa
Alan Cox April 14, 2009, 5:31 p.m. UTC | #6
> Why?  Do you see a guest filling all of memory with 'mov ss' and 
> expecting to break out of it via an interrupt?

Well, I did try mapping a page of 'mov ss' all through memory on real
hardware long ago to see what happened, on a 386 in real mode with
DOSEMU. I was disappointed ;)

H. Peter Anvin April 14, 2009, 5:32 p.m. UTC | #7
Alan Cox wrote:
>> Why?  Do you see a guest filling all of memory with 'mov ss' and 
>> expecting to break out of it via an interrupt?
> 
> Well, I did try mapping a page of 'mov ss' all through memory on real
> hardware long ago to see what happened, on a 386 in real mode with
> DOSEMU. I was disappointed ;)
> 

Heheheheh....

	-hpa
Avi Kivity April 16, 2009, 9:18 a.m. UTC | #8
H. Peter Anvin wrote:
> Avi Kivity wrote:
>   
>> Why do we care?  The guest can only harm itself, and if it wants to
>> disable interrupts, it would be a lot easier for it to run a plain 'cli'.
>>
>> I guess it would be a problem if we emulated 'mov ss' for ordinary
>> userspace or vm86 mode, but we don't.
>>
>>     
>
> Well, the answer is that mov ss is an unprivileged instruction.
>
>   

We don't emulate guest user mode.

Well, if guest userspace can convince its kernel to give it access to 
some memory mapped I/O register, I guess it can execute repeated 'mov 
ss, mmio' and starve the guest kernel.
H. Peter Anvin April 16, 2009, 10:40 p.m. UTC | #9
Avi Kivity wrote:
> 
> We don't emulate guest user mode.
> 
> Well, if guest userspace can convince its kernel to give it access to 
> some memory mapped I/O register, I guess it can execute repeated 'mov 
> ss, mmio' and starve the guest kernel.
> 

It doesn't need an MMIO register to do that, even.

	-hpa
Avi Kivity April 19, 2009, 8:26 a.m. UTC | #10
H. Peter Anvin wrote:
> Avi Kivity wrote:
>>
>> We don't emulate guest user mode.
>>
>> Well, if guest userspace can convince its kernel to give it access to 
>> some memory mapped I/O register, I guess it can execute repeated 'mov 
>> ss, mmio' and starve the guest kernel.
>>
>
> It doesn't need an MMIO register to do that, even.
>

Can you explain how?

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3fc4623..0db1be7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -513,6 +513,7 @@  struct kvm_x86_ops {
 	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*interrupt_shadow_mask)(struct kvm_vcpu *vcpu, int mask);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
 	int (*get_irq)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a15973..6c15498 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@  struct decode_cache {
 	struct fetch_cache fetch;
 };
 
+#define X86_SHADOW_INT_MOV_SS	1
+#define X86_SHADOW_INT_STI	2
+
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
@@ -152,6 +155,10 @@  struct x86_emulate_ctxt {
 	int mode;
 	u32 cs_base;
 
+	/* interruptibility state, as a result of execution of STI or MOV SS */
+	int interruptibility;
+	int movss_int_flag, movss_int_flag_old;
+
 	/* decode cache */
 	struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3ffb695..f41cb08 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -210,6 +210,21 @@  static int is_external_interrupt(u32 info)
 	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 }
 
+static void svm_interrupt_shadow_mask(struct kvm_vcpu *vcpu, int mask)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (mask == 0)
+		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+	else
+		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+	svm->vcpu.arch.interrupt_window_open =
+		(!(svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+		 (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
+		 (svm->vcpu.arch.hflags & HF_GIF_MASK));
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -223,9 +238,8 @@  static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		       __func__, kvm_rip_read(vcpu), svm->next_rip);
 
 	kvm_rip_write(vcpu, svm->next_rip);
-	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
-	vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
+	svm_interrupt_shadow_mask(vcpu, 0);
 }
 
 static int has_svm(void)
@@ -2660,6 +2674,7 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.interrupt_shadow_mask = svm_interrupt_shadow_mask,
 	.patch_hypercall = svm_patch_hypercall,
 	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6997c0..07b0203 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -736,26 +736,47 @@  static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
+static void vmx_interrupt_shadow_mask(struct kvm_vcpu *vcpu, int mask)
+{
+	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	u32 interruptibility = interruptibility_old;
+
+	switch (mask) {
+	case 0:
+		interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+		break;
+	case X86_SHADOW_INT_MOV_SS:
+		interruptibility |= GUEST_INTR_STATE_MOV_SS;
+		break;
+	case X86_SHADOW_INT_STI:
+		interruptibility |= GUEST_INTR_STATE_STI;
+		break;
+	default:
+		printk(KERN_ERR "Bogus mask for interrupt shadow!\n");
+	}
+
+	if (interruptibility != interruptibility_old)
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+
+	vcpu->arch.interrupt_window_open =
+		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+		 !(interruptibility & (GUEST_INTR_STATE_STI |
+				       GUEST_INTR_STATE_MOV_SS)));
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	unsigned long rip;
-	u32 interruptibility;
 
 	rip = kvm_rip_read(vcpu);
 	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 	kvm_rip_write(vcpu, rip);
 
-	/*
-	 * We emulated an instruction, so temporary interrupt blocking
-	 * should be removed, if set.
-	 */
-	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	if (interruptibility & 3)
-		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-			     interruptibility & ~3);
-	vcpu->arch.interrupt_window_open = 1;
+	/* skipping an emulated instruction also counts */
+	vmx_interrupt_shadow_mask(vcpu, 0);
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code)
 {
@@ -3727,6 +3748,7 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.interrupt_shadow_mask = vmx_interrupt_shadow_mask,
 	.patch_hypercall = vmx_patch_hypercall,
 	.get_irq = vmx_get_irq,
 	.set_irq = vmx_inject_irq,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0bb4131..b1fc8b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2364,7 +2364,7 @@  int emulate_instruction(struct kvm_vcpu *vcpu,
 			u16 error_code,
 			int emulation_type)
 {
-	int r;
+	int r, shadow_mask;
 	struct decode_cache *c;
 
 	kvm_clear_exception_queue(vcpu);
@@ -2412,8 +2412,12 @@  int emulate_instruction(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	vcpu->arch.emulate_ctxt.interruptibility = 0;
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
+	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+	kvm_x86_ops->interrupt_shadow_mask(vcpu, shadow_mask);
+
 	if (vcpu->arch.pio.string)
 		return EMULATE_DO_MMIO;
 
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d7c9f6f..1369a2e 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1360,6 +1360,10 @@  x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int io_dir_in;
 	int rc = 0;
 
+	ctxt->movss_int_flag_old = ctxt->movss_int_flag;
+
+	ctxt->movss_int_flag = 0;
+
 	/* Shadow copy of register state. Committed on successful emulation.
 	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
 	 * modify them.
@@ -1610,6 +1614,14 @@  special_insn:
 
 		sel = c->src.val;
 		if (c->modrm_reg <= 5) {
+			if (c->modrm_reg == VCPU_SREG_SS) {
+				if (ctxt->movss_int_flag_old)
+					ctxt->interruptibility |=
+						X86_SHADOW_INT_MOV_SS;
+				else
+					ctxt->movss_int_flag = 1;
+			}
+
 			type_bits = (c->modrm_reg == 1) ? 9 : 1;
 			err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
 							  type_bits, c->modrm_reg);
@@ -1864,6 +1876,8 @@  special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfb: /* sti */
+		if (!(ctxt->eflags & X86_EFLAGS_IF))
+			ctxt->interruptibility |= X86_SHADOW_INT_STI;
 		ctxt->eflags |= X86_EFLAGS_IF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;