[3/4] KVM: x86: correctly merge pending and injected exception

Message ID 20210401143817.1030695-4-mlevitsk@redhat.com (mailing list archive)
State New, archived
Series KVM: nSVM/nVMX: fix nested virtualization treatment of nested exceptions

Commit Message

Maxim Levitsky April 1, 2021, 2:38 p.m. UTC
Allow a pending and an injected exception to co-exist when both are
raised.

Add a 'kvm_deliver_pending_exception' function which 'merges' the pending
and injected exceptions, or delivers a VM exit carrying both when L1
intercepts the pending exception.

The latter is done by vendor code via the new nested callback
'deliver_exception_as_vmexit'.

kvm_deliver_pending_exception is called after each VM exit and prior to
VM entry, which ensures that during userspace VM exits only an injected
exception can be in a raised state.
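
For reference, the pending/injected split itself comes from an earlier
patch in this series; the sketch below only illustrates the layout assumed
here, with field names taken from their uses in this patch (the actual
definition may differ):

	struct kvm_queued_exception {
		bool valid;
		u8 nr;
		bool has_error_code;
		u32 error_code;
	};

	vcpu->arch.pending_exception  - raised, payload not yet delivered
	vcpu->arch.injected_exception - already being delivered to the guest
	vcpu->arch.exception_payload  - the pending payload (CR2 for #PF, DR6 for #DB)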

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   9 ++
 arch/x86/kvm/svm/nested.c       |  27 ++--
 arch/x86/kvm/svm/svm.c          |   2 +-
 arch/x86/kvm/vmx/nested.c       |  58 ++++----
 arch/x86/kvm/vmx/vmx.c          |   2 +-
 arch/x86/kvm/x86.c              | 233 ++++++++++++++++++--------------
 6 files changed, 181 insertions(+), 150 deletions(-)

Comments

Paolo Bonzini April 1, 2021, 7:56 p.m. UTC | #1
On 01/04/21 16:38, Maxim Levitsky wrote:
> +static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
> +{
> +	int class1, class2, ret;
> +
> +	/* try to deliver current pending exception as VM exit */
> +	if (is_guest_mode(vcpu)) {
> +		ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
> +		if (ret || !vcpu->arch.pending_exception.valid)
> +			return ret;
> +	}
> +
> +	/* No injected exception, so just deliver the payload and inject it */
> +	if (!vcpu->arch.injected_exception.valid) {
> +		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
> +					vcpu->arch.pending_exception.has_error_code,
> +					vcpu->arch.pending_exception.error_code);
> +queue:

If you move the queue label to the top of the function, you can "goto queue" for #DF as well and you don't need to call kvm_do_deliver_pending_exception again.  In fact you can merge this function and kvm_deliver_pending_exception completely:


static int kvm_deliver_pending_exception_as_vmexit(struct kvm_vcpu *vcpu)
{
	WARN_ON(!vcpu->arch.pending_exception.valid);
	if (is_guest_mode(vcpu))
		return kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
	else
		return 0;
}

static int kvm_merge_injected_exception(struct kvm_vcpu *vcpu)
{
	/*
	 * First check if the pending exception takes precedence
	 * over the injected one, which will be reported in the
	 * vmexit info.
	 */
	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
	if (ret || !vcpu->arch.pending_exception.valid)
		return ret;

	if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
		...
		return 0;
	}
	...
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
	    || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		...
	}
	vcpu->arch.injected_exception.valid = false;
}

static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
{
	if (!vcpu->arch.pending_exception.valid)
		return 0;

	if (vcpu->arch.injected_exception.valid)
		kvm_merge_injected_exception(vcpu);

	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
	if (ret || !vcpu->arch.pending_exception.valid)
		return ret;

	trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
				vcpu->arch.pending_exception.has_error_code,
				vcpu->arch.pending_exception.error_code);
	...
}

Note that if the pending exception is a page fault, its payload
must be delivered to CR2 before converting it to a double fault.
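
For example, something along these lines in the merge path (an untested
sketch, just to illustrate the ordering):

	/* The #PF payload is the faulting address; commit it to CR2
	 * before the merge replaces the pending #PF with a #DF. */
	if (vcpu->arch.pending_exception.nr == PF_VECTOR &&
	    vcpu->arch.exception_payload.valid)
		kvm_deliver_exception_payload(vcpu);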

Going forward to vmx.c:

> 
>  	if (mtf_pending) {
>  		if (block_nested_events)
>  			return -EBUSY;
> +
>  		nested_vmx_update_pending_dbg(vcpu);

Should this instead be "WARN_ON(vmx_pending_dbg_trap(vcpu));", since
the pending-#DB-plus-MTF combination is now handled in
nested_vmx_deliver_exception_as_vmexit?...

> 
> +
> +	if (vmx->nested.mtf_pending && vmx_pending_dbg_trap(vcpu)) {
> +		/*
> +		 * A pending monitor trap takes precedence over pending
> +		 * debug exception which is 'stashed' into
> +		 * 'GUEST_PENDING_DBG_EXCEPTIONS'
> +		 */
> +
> +		nested_vmx_update_pending_dbg(vcpu);
> +		vmx->nested.mtf_pending = false;
> +		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
> +		return 0;
> +	}

... though this is quite ugly, even more so if you add the case of an
INIT with a pending #DB.  The problem is that INIT and MTF have higher
priority than debug exceptions.

The good thing is that an INIT or MTF plus #DB cannot happen with
nested_run_pending == 1, so it will always be injected right away.

There is precedent with KVM_GET_* modifying the CPU state; in
particular, KVM_GET_MPSTATE can modify CS and RIP and even cause a
vmexit via kvm_apic_accept_events.  And in fact, because
kvm_apic_accept_events calls kvm_check_nested_events, calling it
from KVM_GET_VCPU_EVENTS would fix the problem: the injected exception
would go into the IDT-vectored exit field, while the pending exception
would go into GUEST_PENDING_DBG_EXCEPTIONS and disappear.
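
For example (just a sketch of where the call would go):

	/* at the top of kvm_vcpu_ioctl_x86_get_vcpu_events(), before the
	 * exception state is snapshotted; this may call
	 * kvm_check_nested_events() and move a pending #DB into
	 * GUEST_PENDING_DBG_EXCEPTIONS */
	kvm_apic_accept_events(vcpu);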

However, you cannot do kvm_apic_accept_events twice because there would
be a race with INIT: a #DB exception could be first stored by
KVM_GET_VCPU_EVENTS, then disappear when kvm_apic_accept_events is
called again from KVM_GET_MPSTATE.

Fortunately, the correct order for KVM_GET_* events is first
KVM_GET_VCPU_EVENTS and then KVM_GET_MPSTATE.  So perhaps,
instead of calling kvm_deliver_pending_exception on vmexit,
KVM_GET_VCPU_EVENTS could call kvm_apic_accept_events (and thus
kvm_check_nested_events) rather than KVM_GET_MPSTATE.  In addition,
nested_ops.check_events would be split in two, with high-priority
events (triple fault, now in kvm_check_nested_events; INIT; MTF)
remaining in the first and interrupts in the second, tentatively
named check_interrupts.
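
Roughly (all names tentative):

	/* high-priority events: triple fault, INIT, MTF, exceptions
	 * that L1 intercepts */
	int (*check_events)(struct kvm_vcpu *vcpu);

	/* lower-priority events: IRQs, NMIs, SMIs, the preemption timer */
	int (*check_interrupts)(struct kvm_vcpu *vcpu);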

I'll take a look at cleaning up the current mess we have around
kvm_apic_accept_events, where most of the calls do not have to
handle guest mode at all.

Thanks for this work and for showing that it's possible to fix the
underlying mess with exception vmexit.  It may be a bit more
complicated than this, but it's a great start.

Paolo
Sean Christopherson April 1, 2021, 10:56 p.m. UTC | #2
On Thu, Apr 01, 2021, Paolo Bonzini wrote:
> On 01/04/21 16:38, Maxim Levitsky wrote:
> > +static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
> > +{
> > +	int class1, class2, ret;
> > +
> > +	/* try to deliver current pending exception as VM exit */
> > +	if (is_guest_mode(vcpu)) {
> > +		ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
> > +		if (ret || !vcpu->arch.pending_exception.valid)
> > +			return ret;
> > +	}
> > +
> > +	/* No injected exception, so just deliver the payload and inject it */
> > +	if (!vcpu->arch.injected_exception.valid) {
> > +		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
> > +					vcpu->arch.pending_exception.has_error_code,
> > +					vcpu->arch.pending_exception.error_code);
> > +queue:
> 
> If you move the queue label to the top of the function, you can "goto queue" for #DF as well and you don't need to call kvm_do_deliver_pending_exception again.  In fact you can merge this function and kvm_deliver_pending_exception completely:
> 
> 
> static int kvm_deliver_pending_exception_as_vmexit(struct kvm_vcpu *vcpu)
> {
> 	WARN_ON(!vcpu->arch.pending_exception.valid);
> 	if (is_guest_mode(vcpu))
> 		return kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
> 	else
> 		return 0;
> }
> 
> static int kvm_merge_injected_exception(struct kvm_vcpu *vcpu)
> {
> 	/*
> 	 * First check if the pending exception takes precedence
> 	 * over the injected one, which will be reported in the
> 	 * vmexit info.
> 	 */
> 	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
> 	if (ret || !vcpu->arch.pending_exception.valid)
> 		return ret;
> 
> 	if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
> 		...
> 		return 0;
> 	}
> 	...
> 	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
> 	    || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
> 		...
> 	}
> 	vcpu->arch.injected_exception.valid = false;
> }
> 
> static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
> {
> 	if (!vcpu->arch.pending_exception.valid)
> 		return 0;
> 
> 	if (vcpu->arch.injected_exception.valid)
> 		kvm_merge_injected_exception(vcpu);
> 
> 	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
> 	if (ret || !vcpu->arch.pending_exception.valid)

I really don't like querying arch.pending_exception.valid to see if the exception
was morphed to a VM-Exit.  I also find kvm_deliver_pending_exception_as_vmexit()
to be misleading; to me, that reads as being a command, i.e. "deliver this
pending exception as a VM-Exit".

It'd also be nice to make the helpers closer to pure functions, i.e. pass the
exception as a param instead of pulling it from vcpu->arch.

Now that we have static_call, the number of calls into vendor code isn't a huge
issue.  Moving nested_run_pending to arch code would help, too.  What about
doing something like:

static bool kvm_l1_wants_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector)
{
	return is_guest_mode(vcpu) && kvm_x86_l1_wants_exception(vcpu, vector);
}

	...

	if (!kvm_x86_exception_allowed(vcpu))
		return -EBUSY;

	if (kvm_l1_wants_exception_vmexit(vcpu, vcpu->arch...))
		return kvm_x86_deliver_exception_as_vmexit(...);

> 		return ret;
> 
> 	trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
> 				vcpu->arch.pending_exception.has_error_code,
> 				vcpu->arch.pending_exception.error_code);
> 	...
> }
>

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3b2fd276e8d5..a9b9cd030d9a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1346,6 +1346,15 @@  struct kvm_x86_ops {
 
 struct kvm_x86_nested_ops {
 	int (*check_events)(struct kvm_vcpu *vcpu);
+
+	/*
+	 * Deliver a pending exception as a VM exit if L1 intercepts it.
+	 * Returns -EBUSY if L1 does intercept the exception but it is
+	 * not possible to deliver it right now (for example when a
+	 * nested run is pending).
+	 */
+	int (*deliver_exception_as_vmexit)(struct kvm_vcpu *vcpu);
+
 	bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
 	void (*triple_fault)(struct kvm_vcpu *vcpu);
 	int (*get_state)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 7adad9b6dcad..ff745d59ffcf 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1061,21 +1061,6 @@  static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (vcpu->arch.pending_exception.valid) {
-		/*
-		 * Only a pending nested run can block a pending exception.
-		 * Otherwise an injected NMI/interrupt should either be
-		 * lost or delivered to the nested hypervisor in the EXITINTINFO
-		 * vmcb field, while delivering the pending exception.
-		 */
-		if (svm->nested.nested_run_pending)
-                        return -EBUSY;
-		if (!nested_exit_on_exception(svm))
-			return 0;
-		nested_svm_inject_exception_vmexit(svm);
-		return 0;
-	}
-
 	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
@@ -1107,6 +1092,17 @@  static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int svm_deliver_nested_exception_as_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (svm->nested.nested_run_pending)
+		return -EBUSY;
+	if (nested_exit_on_exception(svm))
+		nested_svm_inject_exception_vmexit(svm);
+	return 0;
+}
+
 int nested_svm_exit_special(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
@@ -1321,6 +1317,7 @@  static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 struct kvm_x86_nested_ops svm_nested_ops = {
 	.check_events = svm_check_nested_events,
 	.triple_fault = nested_svm_triple_fault,
+	.deliver_exception_as_vmexit = svm_deliver_nested_exception_as_vmexit,
 	.get_nested_state_pages = svm_get_nested_state_pages,
 	.get_state = svm_get_nested_state,
 	.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 90b541138c5a..b89e48574c39 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -363,7 +363,7 @@  static void svm_queue_exception(struct kvm_vcpu *vcpu)
 	bool has_error_code = vcpu->arch.injected_exception.has_error_code;
 	u32 error_code = vcpu->arch.injected_exception.error_code;
 
-	kvm_deliver_exception_payload(vcpu);
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
 	if (nr == BP_VECTOR && !nrips) {
 		unsigned long rip, old_rip = kvm_rip_read(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5d54fecff9a7..1c09b132c55c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3768,7 +3768,6 @@  static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long exit_qual;
 	bool block_nested_events =
 	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
 	bool mtf_pending = vmx->nested.mtf_pending;
@@ -3804,41 +3803,15 @@  static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	/*
-	 * Process any exceptions that are not debug traps before MTF.
-	 *
-	 * Note that only a pending nested run can block a pending exception.
-	 * Otherwise an injected NMI/interrupt should either be
-	 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
-	 * while delivering the pending exception.
-	 */
-
-	if (vcpu->arch.pending_exception.valid && !vmx_pending_dbg_trap(vcpu)) {
-		if (vmx->nested.nested_run_pending)
-			return -EBUSY;
-		if (!nested_vmx_check_exception(vcpu, &exit_qual))
-			goto no_vmexit;
-		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-		return 0;
-	}
-
 	if (mtf_pending) {
 		if (block_nested_events)
 			return -EBUSY;
+
 		nested_vmx_update_pending_dbg(vcpu);
 		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
 		return 0;
 	}
 
-	if (vcpu->arch.pending_exception.valid) {
-		if (vmx->nested.nested_run_pending)
-			return -EBUSY;
-		if (!nested_vmx_check_exception(vcpu, &exit_qual))
-			goto no_vmexit;
-		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-		return 0;
-	}
-
 	if (nested_vmx_preemption_timer_pending(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
@@ -3884,6 +3857,34 @@  static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int nested_vmx_deliver_exception_as_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long exit_qual;
+
+	if (vmx->nested.nested_run_pending)
+		return -EBUSY;
+
+	if (vmx->nested.mtf_pending && vmx_pending_dbg_trap(vcpu)) {
+		/*
+		 * A pending monitor trap takes precedence over pending
+		 * debug exception which is 'stashed' into
+		 * 'GUEST_PENDING_DBG_EXCEPTIONS'
+		 */
+
+		nested_vmx_update_pending_dbg(vcpu);
+		vmx->nested.mtf_pending = false;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+		return 0;
+	}
+	if (vcpu->arch.pending_exception.valid) {
+		if (nested_vmx_check_exception(vcpu, &exit_qual))
+			nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+		return 0;
+	}
+	return 0;
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
 	ktime_t remaining =
@@ -6603,6 +6604,7 @@  __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 
 struct kvm_x86_nested_ops vmx_nested_ops = {
 	.check_events = vmx_check_nested_events,
+	.deliver_exception_as_vmexit = nested_vmx_deliver_exception_as_vmexit,
 	.hv_timer_pending = nested_vmx_preemption_timer_pending,
 	.triple_fault = nested_vmx_triple_fault,
 	.get_state = vmx_get_nested_state,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a9b241d2b271..fc6bc40d47b0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1682,7 +1682,7 @@  static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 	u32 error_code = vcpu->arch.injected_exception.error_code;
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-	kvm_deliver_exception_payload(vcpu);
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 493d87b0c2d5..a363204f37be 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -535,86 +535,30 @@  void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
 
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-		unsigned nr, bool has_error, u32 error_code,
-	        bool has_payload, unsigned long payload, bool reinject)
+				   unsigned int nr, bool has_error, u32 error_code,
+				   bool has_payload, unsigned long payload,
+				   bool reinject)
 {
-	u32 prev_nr;
-	int class1, class2;
-
+	struct kvm_queued_exception *exc;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
-	if (!vcpu->arch.pending_exception.valid && !vcpu->arch.injected_exception.valid) {
-	queue:
-		if (reinject) {
-			/*
-			 * On vmentry, vcpu->arch.exception.pending is only
-			 * true if an event injection was blocked by
-			 * nested_run_pending.  In that case, however,
-			 * vcpu_enter_guest requests an immediate exit,
-			 * and the guest shouldn't proceed far enough to
-			 * need reinjection.
-			 */
-			WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
-			if (WARN_ON_ONCE(has_payload)) {
-				/*
-				 * A reinjected event has already
-				 * delivered its payload.
-				 */
-				has_payload = false;
-				payload = 0;
-			}
-
-			vcpu->arch.injected_exception.valid = true;
-			vcpu->arch.injected_exception.has_error_code = has_error;
-			vcpu->arch.injected_exception.nr = nr;
-			vcpu->arch.injected_exception.error_code = error_code;
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
+	WARN_ON_ONCE(reinject && vcpu->arch.injected_exception.valid);
 
-		} else {
-			vcpu->arch.pending_exception.valid = true;
-			vcpu->arch.injected_exception.valid = false;
-			vcpu->arch.pending_exception.has_error_code = has_error;
-			vcpu->arch.pending_exception.nr = nr;
-			vcpu->arch.pending_exception.error_code = error_code;
-		}
-
-		vcpu->arch.exception_payload.valid = has_payload;
-		vcpu->arch.exception_payload.value = payload;
-		if (!is_guest_mode(vcpu))
-			kvm_deliver_exception_payload(vcpu);
-		return;
-	}
-
-	/* to check exception */
-	prev_nr = vcpu->arch.injected_exception.nr;
-	if (prev_nr == DF_VECTOR) {
-		/* triple fault -> shutdown */
-		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-		return;
-	}
-	class1 = exception_class(prev_nr);
-	class2 = exception_class(nr);
-	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
-		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
-		/*
-		 * Generate double fault per SDM Table 5-5.  Set
-		 * exception.pending = true so that the double fault
-		 * can trigger a nested vmexit.
-		 */
-		vcpu->arch.pending_exception.valid = true;
-		vcpu->arch.injected_exception.valid = false;
-		vcpu->arch.pending_exception.has_error_code = true;
-		vcpu->arch.pending_exception.nr = DF_VECTOR;
-		vcpu->arch.pending_exception.error_code = 0;
+	exc = reinject ? &vcpu->arch.injected_exception :
+			 &vcpu->arch.pending_exception;
+	exc->valid = true;
+	exc->nr = nr;
+	exc->has_error_code = has_error;
+	exc->error_code = error_code;
 
-		vcpu->arch.exception_payload.valid = false;
-		vcpu->arch.exception_payload.value = 0;
-	} else
-		/* replace previous exception with a new one in a hope
-		   that instruction re-execution will regenerate lost
-		   exception */
-		goto queue;
+	/* a re-injected exception has its payload already delivered */
+	WARN_ON_ONCE(reinject && has_payload);
+	vcpu->arch.exception_payload.valid = has_payload;
+	vcpu->arch.exception_payload.value = payload;
 }
 
+
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
 	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
@@ -641,6 +585,95 @@  static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
 			       true, payload, false);
 }
 
+static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
+{
+	int class1, class2, ret;
+
+	/* try to deliver current pending exception as VM exit */
+	if (is_guest_mode(vcpu)) {
+		ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
+		if (ret || !vcpu->arch.pending_exception.valid)
+			return ret;
+	}
+
+	/* No injected exception, so just deliver the payload and inject it */
+	if (!vcpu->arch.injected_exception.valid) {
+		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
+					vcpu->arch.pending_exception.has_error_code,
+					vcpu->arch.pending_exception.error_code);
+queue:
+		/* Intel SDM 17.3.1.1 */
+		if (exception_type(vcpu->arch.pending_exception.nr) == EXCPT_FAULT)
+			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+					     X86_EFLAGS_RF);
+
+		kvm_deliver_exception_payload(vcpu);
+
+		/* Intel SDM 17.2.4
+		 * The processor clears the GD flag upon entering the
+		 * debug exception handler, to allow the handler access
+		 * to the debug registers.
+		 */
+		if (vcpu->arch.pending_exception.nr == DB_VECTOR) {
+			if (vcpu->arch.dr7 & DR7_GD) {
+				vcpu->arch.dr7 &= ~DR7_GD;
+				kvm_update_dr7(vcpu);
+			}
+		}
+
+		if (vcpu->arch.pending_exception.error_code && !is_protmode(vcpu))
+			vcpu->arch.pending_exception.error_code = false;
+
+		vcpu->arch.injected_exception = vcpu->arch.pending_exception;
+		vcpu->arch.pending_exception.valid = false;
+		return 0;
+	}
+
+	/* Convert a pending exception and an injected #DF to a triple fault */
+	if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
+		/* triple fault -> shutdown */
+		vcpu->arch.injected_exception.valid = false;
+		vcpu->arch.pending_exception.valid = false;
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		return 0;
+	}
+
+	class1 = exception_class(vcpu->arch.injected_exception.nr);
+	class2 = exception_class(vcpu->arch.pending_exception.nr);
+
+	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+
+		/* Generate double fault per SDM Table 5-5. */
+		vcpu->arch.injected_exception.valid = false;
+		vcpu->arch.pending_exception.valid = true;
+		vcpu->arch.pending_exception.has_error_code = true;
+		vcpu->arch.pending_exception.nr = DF_VECTOR;
+		vcpu->arch.pending_exception.error_code = 0;
+		vcpu->arch.exception_payload.valid = false;
+	} else
+		/* Drop the injected exception and replace it with the pending one */
+		goto queue;
+
+	return 0;
+}
+
+static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
+{
+	int ret = 0;
+
+	if (!vcpu->arch.pending_exception.valid)
+		return ret;
+
+	ret = kvm_do_deliver_pending_exception(vcpu);
+
+	if (ret || !vcpu->arch.pending_exception.valid)
+		return ret;
+
+	WARN_ON_ONCE(vcpu->arch.pending_exception.nr != DF_VECTOR);
+	return kvm_do_deliver_pending_exception(vcpu);
+}
+
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
 	if (err)
@@ -4297,6 +4330,12 @@  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	    vcpu->arch.pending_exception.valid && vcpu->arch.exception_payload.valid)
 		kvm_deliver_exception_payload(vcpu);
 
+	/*
+	 * Currently we merge the pending and the injected exceptions
+	 * after each VM exit, which can fail only when a nested run is pending,
+	 * in which case only an injected (from us or L1) exception is possible.
+	 */
+
 	WARN_ON_ONCE(vcpu->arch.pending_exception.valid &&
 		     vcpu->arch.injected_exception.valid);
 
@@ -8401,8 +8440,6 @@  int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.injected_exception.error_code && !is_protmode(vcpu))
-		vcpu->arch.injected_exception.error_code = false;
 	static_call(kvm_x86_queue_exception)(vcpu);
 }
 
@@ -8411,8 +8448,13 @@  static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 	int r;
 	bool can_inject = true;
 
-	/* try to reinject previous events if any */
+	r = kvm_deliver_pending_exception(vcpu);
+	if (r < 0)
+		goto busy;
+
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
+	/* try to reinject previous events if any */
 	if (vcpu->arch.injected_exception.valid) {
 		kvm_inject_exception(vcpu);
 		can_inject = false;
@@ -8431,7 +8473,7 @@  static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 	 * serviced prior to recognizing any new events in order to
 	 * fully complete the previous instruction.
 	 */
-	else if (!vcpu->arch.pending_exception.valid) {
+	else {
 		if (vcpu->arch.nmi_injected) {
 			static_call(kvm_x86_set_nmi)(vcpu);
 			can_inject = false;
@@ -8441,9 +8483,6 @@  static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 		}
 	}
 
-	WARN_ON_ONCE(vcpu->arch.pending_exception.valid &&
-		     vcpu->arch.injected_exception.valid);
-
 	/*
 	 * Call check_nested_events() even if we reinjected a previous event
 	 * in order for caller to determine if it should require immediate-exit
@@ -8456,30 +8495,6 @@  static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 			goto busy;
 	}
 
-	/* try to inject new event if pending */
-	if (vcpu->arch.pending_exception.valid) {
-		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
-					vcpu->arch.pending_exception.has_error_code,
-					vcpu->arch.pending_exception.error_code);
-
-		vcpu->arch.injected_exception = vcpu->arch.pending_exception;
-		vcpu->arch.pending_exception.valid = false;
-
-		if (exception_type(vcpu->arch.injected_exception.nr) == EXCPT_FAULT)
-			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
-					     X86_EFLAGS_RF);
-
-		if (vcpu->arch.injected_exception.nr == DB_VECTOR) {
-			kvm_deliver_exception_payload(vcpu);
-			if (vcpu->arch.dr7 & DR7_GD) {
-				vcpu->arch.dr7 &= ~DR7_GD;
-				kvm_update_dr7(vcpu);
-			}
-		}
-
-		kvm_inject_exception(vcpu);
-		can_inject = false;
-	}
 
 	/*
 	 * Finally, inject interrupt events.  If an event cannot be injected
@@ -9270,6 +9285,14 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_lapic_sync_from_vapic(vcpu);
 
 	r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
+
+	/*
+	 * Deliver the pending exception so that the state of having a pending
+	 * and an injected exception is not visible to userspace.
+	 */
+
+	kvm_deliver_pending_exception(vcpu);
+
 	return r;
 
 cancel_injection:
@@ -11014,7 +11037,7 @@  static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.pv.pv_unhalted)
 		return true;
 
-	if (vcpu->arch.pending_exception.valid)
+	if (vcpu->arch.pending_exception.valid || vcpu->arch.injected_exception.valid)
 		return true;
 
 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||