diff mbox series

[QEMU,v3,8/9] KVM: i386: Add support for KVM_CAP_EXCEPTION_PAYLOAD

Message ID 20190617175658.135869-9-liran.alon@oracle.com (mailing list archive)
State New, archived
Headers show
Series : KVM: i386: Add support for save and restore of nested state | expand

Commit Message

Liran Alon June 17, 2019, 5:56 p.m. UTC
Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
introduced a new KVM capability which allows userspace to correctly
distinguish between pending and injected exceptions.

This distinction is important in nested virtualization scenarios
because an L2 pending exception can still be intercepted by the L1 hypervisor
while an L2 injected exception cannot.

Furthermore, when QEMU attempts to inject an exception, it should
specify the exception payload (CR2 in case of #PF or DR6 in case of
#DB) instead of having the payload already delivered in the respective
vCPU register. This is because, if the exception is injected into an
L2 guest and is intercepted by the L1 hypervisor, the payload needs to
be reported to the L1 intercept (VMExit handler) while leaving the
respective vCPU register unchanged.

This commit adds support for QEMU to properly utilise this new KVM
capability (KVM_CAP_EXCEPTION_PAYLOAD).

Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Signed-off-by: Liran Alon <liran.alon@oracle.com>
---
 target/i386/cpu.c        |   6 ++-
 target/i386/cpu.h        |   6 ++-
 target/i386/hvf/hvf.c    |  10 ++--
 target/i386/hvf/x86hvf.c |   4 +-
 target/i386/kvm.c        | 101 ++++++++++++++++++++++++++++++++-------
 target/i386/machine.c    |  84 +++++++++++++++++++++++++++++++-
 6 files changed, 187 insertions(+), 24 deletions(-)

Comments

Dr. David Alan Gilbert June 18, 2019, 9:07 a.m. UTC | #1
* Liran Alon (liran.alon@oracle.com) wrote:
> Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
> introduced a new KVM capability which allows userspace to correctly
> distinguish between pending and injected exceptions.
> 
> This distinguish is important in case of nested virtualization scenarios
> because a L2 pending exception can still be intercepted by the L1 hypervisor
> while a L2 injected exception cannot.
> 
> Furthermore, when an exception is attempted to be injected by QEMU,
> QEMU should specify the exception payload (CR2 in case of #PF or
> DR6 in case of #DB) instead of having the payload already delivered in
> the respective vCPU register. Because in case exception is injected to
> L2 guest and is intercepted by L1 hypervisor, then payload needs to be
> reported to L1 intercept (VMExit handler) while still preserving
> respective vCPU register unchanged.
> 
> This commit adds support for QEMU to properly utilise this new KVM
> capability (KVM_CAP_EXCEPTION_PAYLOAD).

Does this kvm capability become a requirement for the nested migration
then? If so, is it wired into the blockers?

Dave

> Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
> Signed-off-by: Liran Alon <liran.alon@oracle.com>
> ---
>  target/i386/cpu.c        |   6 ++-
>  target/i386/cpu.h        |   6 ++-
>  target/i386/hvf/hvf.c    |  10 ++--
>  target/i386/hvf/x86hvf.c |   4 +-
>  target/i386/kvm.c        | 101 ++++++++++++++++++++++++++++++++-------
>  target/i386/machine.c    |  84 +++++++++++++++++++++++++++++++-
>  6 files changed, 187 insertions(+), 24 deletions(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 197201087e65..a026e49f5c0d 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -4774,7 +4774,11 @@ static void x86_cpu_reset(CPUState *s)
>      memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
>  
>      env->interrupt_injected = -1;
> -    env->exception_injected = -1;
> +    env->exception_nr = -1;
> +    env->exception_pending = 0;
> +    env->exception_injected = 0;
> +    env->exception_has_payload = false;
> +    env->exception_payload = 0;
>      env->nmi_injected = false;
>  #if !defined(CONFIG_USER_ONLY)
>      /* We hard-wire the BSP to the first CPU. */
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index a6bb71849869..e2ac4132972d 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1338,10 +1338,14 @@ typedef struct CPUX86State {
>  
>      /* For KVM */
>      uint32_t mp_state;
> -    int32_t exception_injected;
> +    int32_t exception_nr;
>      int32_t interrupt_injected;
>      uint8_t soft_interrupt;
> +    uint8_t exception_pending;
> +    uint8_t exception_injected;
>      uint8_t has_error_code;
> +    uint8_t exception_has_payload;
> +    uint64_t exception_payload;
>      uint32_t ins_len;
>      uint32_t sipi_vector;
>      bool tsc_valid;
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index 2751c8125ca2..dc4bb63536c8 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -605,7 +605,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
>      X86CPU *x86_cpu = X86_CPU(cpu);
>      CPUX86State *env = &x86_cpu->env;
>  
> -    env->exception_injected = -1;
> +    env->exception_nr = -1;
> +    env->exception_pending = 0;
> +    env->exception_injected = 0;
>      env->interrupt_injected = -1;
>      env->nmi_injected = false;
>      if (idtvec_info & VMCS_IDT_VEC_VALID) {
> @@ -619,7 +621,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
>              break;
>          case VMCS_IDT_VEC_HWEXCEPTION:
>          case VMCS_IDT_VEC_SWEXCEPTION:
> -            env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
> +            env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
> +            env->exception_injected = 1;
>              break;
>          case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
>          default:
> @@ -912,7 +915,8 @@ int hvf_vcpu_exec(CPUState *cpu)
>              macvm_set_rip(cpu, rip + ins_len);
>              break;
>          case VMX_REASON_VMCALL:
> -            env->exception_injected = EXCP0D_GPF;
> +            env->exception_nr = EXCP0D_GPF;
> +            env->exception_injected = 1;
>              env->has_error_code = true;
>              env->error_code = 0;
>              break;
> diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
> index df8e946fbcde..e0ea02d631e6 100644
> --- a/target/i386/hvf/x86hvf.c
> +++ b/target/i386/hvf/x86hvf.c
> @@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state)
>      if (env->interrupt_injected != -1) {
>          vector = env->interrupt_injected;
>          intr_type = VMCS_INTR_T_SWINTR;
> -    } else if (env->exception_injected != -1) {
> -        vector = env->exception_injected;
> +    } else if (env->exception_nr != -1) {
> +        vector = env->exception_nr;
>          if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
>              intr_type = VMCS_INTR_T_SWEXCEPTION;
>          } else {
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 5950c3ed0d1c..797f8ac46435 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -104,6 +104,7 @@ static uint32_t num_architectural_pmu_fixed_counters;
>  static int has_xsave;
>  static int has_xcrs;
>  static int has_pit_state2;
> +static int has_exception_payload;
>  
>  static bool has_msr_mcg_ext_ctl;
>  
> @@ -584,15 +585,56 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>      /* Hope we are lucky for AO MCE */
>  }
>  
> +static void kvm_reset_exception(CPUX86State *env)
> +{
> +	env->exception_nr = -1;
> +	env->exception_pending = 0;
> +	env->exception_injected = 0;
> +	env->exception_has_payload = false;
> +	env->exception_payload = 0;
> +}
> +
> +static void kvm_queue_exception(CPUX86State *env,
> +                                int32_t exception_nr,
> +                                uint8_t exception_has_payload,
> +                                uint64_t exception_payload)
> +{
> +    assert(env->exception_nr == -1);
> +    assert(!env->exception_pending);
> +    assert(!env->exception_injected);
> +    assert(!env->exception_has_payload);
> +
> +    env->exception_nr = exception_nr;
> +
> +    if (has_exception_payload) {
> +        env->exception_pending = 1;
> +
> +        env->exception_has_payload = exception_has_payload;
> +        env->exception_payload = exception_payload;
> +    } else {
> +        env->exception_injected = 1;
> +
> +        if (exception_nr == EXCP01_DB) {
> +            assert(exception_has_payload);
> +            env->dr[6] = exception_payload;
> +        } else if (exception_nr == EXCP0E_PAGE) {
> +            assert(exception_has_payload);
> +            env->cr[2] = exception_payload;
> +        } else {
> +            assert(!exception_has_payload);
> +        }
> +    }
> +}
> +
>  static int kvm_inject_mce_oldstyle(X86CPU *cpu)
>  {
>      CPUX86State *env = &cpu->env;
>  
> -    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
> +    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
>          unsigned int bank, bank_num = env->mcg_cap & 0xff;
>          struct kvm_x86_mce mce;
>  
> -        env->exception_injected = -1;
> +        kvm_reset_exception(env);
>  
>          /*
>           * There must be at least one bank in use if an MCE is pending.
> @@ -1610,6 +1652,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>  
>      hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
>  
> +    has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
> +    if (has_exception_payload) {
> +        ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
> +        if (ret < 0) {
> +            error_report("kvm: Failed to enable exception payload cap: %s",
> +                         strerror(-ret));
> +            return ret;
> +        }
> +    }
> +
>      ret = kvm_get_supported_msrs(s);
>      if (ret < 0) {
>          return ret;
> @@ -2914,8 +2966,16 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
>          return 0;
>      }
>  
> -    events.exception.injected = (env->exception_injected >= 0);
> -    events.exception.nr = env->exception_injected;
> +    events.flags = 0;
> +
> +    if (has_exception_payload) {
> +        events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
> +        events.exception.pending = env->exception_pending;
> +        events.exception_has_payload = env->exception_has_payload;
> +        events.exception_payload = env->exception_payload;
> +    }
> +    events.exception.nr = env->exception_nr;
> +    events.exception.injected = env->exception_injected;
>      events.exception.has_error_code = env->has_error_code;
>      events.exception.error_code = env->error_code;
>  
> @@ -2928,7 +2988,6 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
>      events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
>  
>      events.sipi_vector = env->sipi_vector;
> -    events.flags = 0;
>  
>      if (has_msr_smbase) {
>          events.smi.smm = !!(env->hflags & HF_SMM_MASK);
> @@ -2978,8 +3037,19 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
>      if (ret < 0) {
>         return ret;
>      }
> -    env->exception_injected =
> -       events.exception.injected ? events.exception.nr : -1;
> +
> +    if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
> +        env->exception_pending = events.exception.pending;
> +        env->exception_has_payload = events.exception_has_payload;
> +        env->exception_payload = events.exception_payload;
> +    } else {
> +        env->exception_pending = 0;
> +        env->exception_has_payload = false;
> +    }
> +    env->exception_injected = events.exception.injected;
> +    env->exception_nr =
> +        (env->exception_pending || env->exception_injected) ?
> +        events.exception.nr : -1;
>      env->has_error_code = events.exception.has_error_code;
>      env->error_code = events.exception.error_code;
>  
> @@ -3031,12 +3101,12 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu)
>      unsigned long reinject_trap = 0;
>  
>      if (!kvm_has_vcpu_events()) {
> -        if (env->exception_injected == EXCP01_DB) {
> +        if (env->exception_nr == EXCP01_DB) {
>              reinject_trap = KVM_GUESTDBG_INJECT_DB;
>          } else if (env->exception_injected == EXCP03_INT3) {
>              reinject_trap = KVM_GUESTDBG_INJECT_BP;
>          }
> -        env->exception_injected = -1;
> +        kvm_reset_exception(env);
>      }
>  
>      /*
> @@ -3412,13 +3482,13 @@ int kvm_arch_process_async_events(CPUState *cs)
>  
>          kvm_cpu_synchronize_state(cs);
>  
> -        if (env->exception_injected == EXCP08_DBLE) {
> +        if (env->exception_nr == EXCP08_DBLE) {
>              /* this means triple fault */
>              qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
>              cs->exit_request = 1;
>              return 0;
>          }
> -        env->exception_injected = EXCP12_MCHK;
> +        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
>          env->has_error_code = 0;
>  
>          cs->halted = 0;
> @@ -3633,14 +3703,13 @@ static int kvm_handle_debug(X86CPU *cpu,
>      }
>      if (ret == 0) {
>          cpu_synchronize_state(cs);
> -        assert(env->exception_injected == -1);
> +        assert(env->exception_nr == -1);
>  
>          /* pass to guest */
> -        env->exception_injected = arch_info->exception;
> +        kvm_queue_exception(env, arch_info->exception,
> +                            arch_info->exception == EXCP01_DB,
> +                            arch_info->dr6);
>          env->has_error_code = 0;
> -        if (arch_info->exception == EXCP01_DB) {
> -            env->dr[6] = arch_info->dr6;
> -        }
>      }
>  
>      return ret;
> diff --git a/target/i386/machine.c b/target/i386/machine.c
> index 95299ebff44a..6aac0fe9cb56 100644
> --- a/target/i386/machine.c
> +++ b/target/i386/machine.c
> @@ -240,6 +240,41 @@ static int cpu_pre_save(void *opaque)
>      }
>  #endif
>  
> +    /*
> +     * When vCPU is running L2 and exception is still pending,
> +     * it can potentially be intercepted by L1 hypervisor.
> +     * In contrast to an injected exception which cannot be
> +     * intercepted anymore.
> +     *
> +     * Furthermore, when a L2 exception is intercepted by L1
> +     * hypervisor, it's exception payload (CR2/DR6 on #PF/#DB)
> +     * should not be set yet in the respective vCPU register.
> +     * Thus, in case an exception is pending, it is
> +     * important to save the exception payload seperately.
> +     *
> +     * Therefore, if an exception is not in a pending state
> +     * or vCPU is not in guest-mode, it is not important to
> +     * distinguish between a pending and injected exception
> +     * and we don't need to store seperately the exception payload.
> +     *
> +     * In order to preserve better backwards-compatabile migration,
> +     * convert a pending exception to an injected exception in
> +     * case it is not important to distingiush between them
> +     * as described above.
> +     */
> +    if (env->exception_pending && !(env->hflags & HF_GUEST_MASK)) {
> +        env->exception_pending = 0;
> +        env->exception_injected = 1;
> +
> +        if (env->exception_has_payload) {
> +            if (env->exception_nr == EXCP01_DB) {
> +                env->dr[6] = env->exception_payload;
> +            } else if (env->exception_nr == EXCP0E_PAGE) {
> +                env->cr[2] = env->exception_payload;
> +            }
> +        }
> +    }
> +
>      return 0;
>  }
>  
> @@ -297,6 +332,23 @@ static int cpu_post_load(void *opaque, int version_id)
>      }
>  #endif
>  
> +    /*
> +     * There are cases that we can get valid exception_nr with both
> +     * exception_pending and exception_injected being cleared.
> +     * This can happen in one of the following scenarios:
> +     * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support.
> +     * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support.
> +     * 3) "cpu/exception_info" subsection not sent because there is no exception
> +     *	  pending or guest wasn't running L2 (See comment in cpu_pre_save()).
> +     *
> +     * In those cases, we can just deduce that a valid exception_nr means
> +     * we can treat the exception as already injected.
> +     */
> +    if ((env->exception_nr != -1) &&
> +        !env->exception_pending && !env->exception_injected) {
> +        env->exception_injected = 1;
> +    }
> +
>      env->fpstt = (env->fpus_vmstate >> 11) & 7;
>      env->fpus = env->fpus_vmstate & ~0x3800;
>      env->fptag_vmstate ^= 0xff;
> @@ -342,6 +394,35 @@ static bool steal_time_msr_needed(void *opaque)
>      return cpu->env.steal_time_msr != 0;
>  }
>  
> +static bool exception_info_needed(void *opaque)
> +{
> +    X86CPU *cpu = opaque;
> +    CPUX86State *env = &cpu->env;
> +
> +    /*
> +     * It is important to save exception-info only in case
> +     * we need to distingiush between a pending and injected
> +     * exception. Which is only required in case there is a
> +     * pending exception and vCPU is running L2.
> +     * For more info, refer to comment in cpu_pre_save().
> +     */
> +    return (env->exception_pending && (env->hflags & HF_GUEST_MASK));
> +}
> +
> +static const VMStateDescription vmstate_exception_info = {
> +    .name = "cpu/exception_info",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = exception_info_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT8(env.exception_pending, X86CPU),
> +        VMSTATE_UINT8(env.exception_injected, X86CPU),
> +        VMSTATE_UINT8(env.exception_has_payload, X86CPU),
> +        VMSTATE_UINT64(env.exception_payload, X86CPU),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription vmstate_steal_time_msr = {
>      .name = "cpu/steal_time_msr",
>      .version_id = 1,
> @@ -1228,7 +1309,7 @@ VMStateDescription vmstate_x86_cpu = {
>          VMSTATE_INT32(env.interrupt_injected, X86CPU),
>          VMSTATE_UINT32(env.mp_state, X86CPU),
>          VMSTATE_UINT64(env.tsc, X86CPU),
> -        VMSTATE_INT32(env.exception_injected, X86CPU),
> +        VMSTATE_INT32(env.exception_nr, X86CPU),
>          VMSTATE_UINT8(env.soft_interrupt, X86CPU),
>          VMSTATE_UINT8(env.nmi_injected, X86CPU),
>          VMSTATE_UINT8(env.nmi_pending, X86CPU),
> @@ -1252,6 +1333,7 @@ VMStateDescription vmstate_x86_cpu = {
>          /* The above list is not sorted /wrt version numbers, watch out! */
>      },
>      .subsections = (const VMStateDescription*[]) {
> +        &vmstate_exception_info,
>          &vmstate_async_pf_msr,
>          &vmstate_pv_eoi_msr,
>          &vmstate_steal_time_msr,
> -- 
> 2.20.1
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Liran Alon June 18, 2019, 3:45 p.m. UTC | #2
> On 18 Jun 2019, at 12:07, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> 
> * Liran Alon (liran.alon@oracle.com) wrote:
>> Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
>> introduced a new KVM capability which allows userspace to correctly
>> distinguish between pending and injected exceptions.
>> 
>> This distinguish is important in case of nested virtualization scenarios
>> because a L2 pending exception can still be intercepted by the L1 hypervisor
>> while a L2 injected exception cannot.
>> 
>> Furthermore, when an exception is attempted to be injected by QEMU,
>> QEMU should specify the exception payload (CR2 in case of #PF or
>> DR6 in case of #DB) instead of having the payload already delivered in
>> the respective vCPU register. Because in case exception is injected to
>> L2 guest and is intercepted by L1 hypervisor, then payload needs to be
>> reported to L1 intercept (VMExit handler) while still preserving
>> respective vCPU register unchanged.
>> 
>> This commit adds support for QEMU to properly utilise this new KVM
>> capability (KVM_CAP_EXCEPTION_PAYLOAD).
> 
> Does this kvm capability become a requirement for the nested migration
> then? If so, is it wired into the blockers?
> 
> Dave
> 

That’s a very good point.
Yes this capability is required in order to correctly migrate VMs running nested hypervisors.
I agree that I should add a migration blocker for nested in case it isn’t present.
Nice catch.

-Liran
diff mbox series

Patch

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 197201087e65..a026e49f5c0d 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4774,7 +4774,11 @@  static void x86_cpu_reset(CPUState *s)
     memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
 
     env->interrupt_injected = -1;
-    env->exception_injected = -1;
+    env->exception_nr = -1;
+    env->exception_pending = 0;
+    env->exception_injected = 0;
+    env->exception_has_payload = false;
+    env->exception_payload = 0;
     env->nmi_injected = false;
 #if !defined(CONFIG_USER_ONLY)
     /* We hard-wire the BSP to the first CPU. */
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index a6bb71849869..e2ac4132972d 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1338,10 +1338,14 @@  typedef struct CPUX86State {
 
     /* For KVM */
     uint32_t mp_state;
-    int32_t exception_injected;
+    int32_t exception_nr;
     int32_t interrupt_injected;
     uint8_t soft_interrupt;
+    uint8_t exception_pending;
+    uint8_t exception_injected;
     uint8_t has_error_code;
+    uint8_t exception_has_payload;
+    uint64_t exception_payload;
     uint32_t ins_len;
     uint32_t sipi_vector;
     bool tsc_valid;
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index 2751c8125ca2..dc4bb63536c8 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -605,7 +605,9 @@  static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;
 
-    env->exception_injected = -1;
+    env->exception_nr = -1;
+    env->exception_pending = 0;
+    env->exception_injected = 0;
     env->interrupt_injected = -1;
     env->nmi_injected = false;
     if (idtvec_info & VMCS_IDT_VEC_VALID) {
@@ -619,7 +621,8 @@  static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
             break;
         case VMCS_IDT_VEC_HWEXCEPTION:
         case VMCS_IDT_VEC_SWEXCEPTION:
-            env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
+            env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
+            env->exception_injected = 1;
             break;
         case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
         default:
@@ -912,7 +915,8 @@  int hvf_vcpu_exec(CPUState *cpu)
             macvm_set_rip(cpu, rip + ins_len);
             break;
         case VMX_REASON_VMCALL:
-            env->exception_injected = EXCP0D_GPF;
+            env->exception_nr = EXCP0D_GPF;
+            env->exception_injected = 1;
             env->has_error_code = true;
             env->error_code = 0;
             break;
diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
index df8e946fbcde..e0ea02d631e6 100644
--- a/target/i386/hvf/x86hvf.c
+++ b/target/i386/hvf/x86hvf.c
@@ -362,8 +362,8 @@  bool hvf_inject_interrupts(CPUState *cpu_state)
     if (env->interrupt_injected != -1) {
         vector = env->interrupt_injected;
         intr_type = VMCS_INTR_T_SWINTR;
-    } else if (env->exception_injected != -1) {
-        vector = env->exception_injected;
+    } else if (env->exception_nr != -1) {
+        vector = env->exception_nr;
         if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
             intr_type = VMCS_INTR_T_SWEXCEPTION;
         } else {
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 5950c3ed0d1c..797f8ac46435 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -104,6 +104,7 @@  static uint32_t num_architectural_pmu_fixed_counters;
 static int has_xsave;
 static int has_xcrs;
 static int has_pit_state2;
+static int has_exception_payload;
 
 static bool has_msr_mcg_ext_ctl;
 
@@ -584,15 +585,56 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
     /* Hope we are lucky for AO MCE */
 }
 
+static void kvm_reset_exception(CPUX86State *env)
+{
+	env->exception_nr = -1;
+	env->exception_pending = 0;
+	env->exception_injected = 0;
+	env->exception_has_payload = false;
+	env->exception_payload = 0;
+}
+
+static void kvm_queue_exception(CPUX86State *env,
+                                int32_t exception_nr,
+                                uint8_t exception_has_payload,
+                                uint64_t exception_payload)
+{
+    assert(env->exception_nr == -1);
+    assert(!env->exception_pending);
+    assert(!env->exception_injected);
+    assert(!env->exception_has_payload);
+
+    env->exception_nr = exception_nr;
+
+    if (has_exception_payload) {
+        env->exception_pending = 1;
+
+        env->exception_has_payload = exception_has_payload;
+        env->exception_payload = exception_payload;
+    } else {
+        env->exception_injected = 1;
+
+        if (exception_nr == EXCP01_DB) {
+            assert(exception_has_payload);
+            env->dr[6] = exception_payload;
+        } else if (exception_nr == EXCP0E_PAGE) {
+            assert(exception_has_payload);
+            env->cr[2] = exception_payload;
+        } else {
+            assert(!exception_has_payload);
+        }
+    }
+}
+
 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
 {
     CPUX86State *env = &cpu->env;
 
-    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
+    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
         unsigned int bank, bank_num = env->mcg_cap & 0xff;
         struct kvm_x86_mce mce;
 
-        env->exception_injected = -1;
+        kvm_reset_exception(env);
 
         /*
          * There must be at least one bank in use if an MCE is pending.
@@ -1610,6 +1652,16 @@  int kvm_arch_init(MachineState *ms, KVMState *s)
 
     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
 
+    has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
+    if (has_exception_payload) {
+        ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
+        if (ret < 0) {
+            error_report("kvm: Failed to enable exception payload cap: %s",
+                         strerror(-ret));
+            return ret;
+        }
+    }
+
     ret = kvm_get_supported_msrs(s);
     if (ret < 0) {
         return ret;
@@ -2914,8 +2966,16 @@  static int kvm_put_vcpu_events(X86CPU *cpu, int level)
         return 0;
     }
 
-    events.exception.injected = (env->exception_injected >= 0);
-    events.exception.nr = env->exception_injected;
+    events.flags = 0;
+
+    if (has_exception_payload) {
+        events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+        events.exception.pending = env->exception_pending;
+        events.exception_has_payload = env->exception_has_payload;
+        events.exception_payload = env->exception_payload;
+    }
+    events.exception.nr = env->exception_nr;
+    events.exception.injected = env->exception_injected;
     events.exception.has_error_code = env->has_error_code;
     events.exception.error_code = env->error_code;
 
@@ -2928,7 +2988,6 @@  static int kvm_put_vcpu_events(X86CPU *cpu, int level)
     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
 
     events.sipi_vector = env->sipi_vector;
-    events.flags = 0;
 
     if (has_msr_smbase) {
         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
@@ -2978,8 +3037,19 @@  static int kvm_get_vcpu_events(X86CPU *cpu)
     if (ret < 0) {
        return ret;
     }
-    env->exception_injected =
-       events.exception.injected ? events.exception.nr : -1;
+
+    if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+        env->exception_pending = events.exception.pending;
+        env->exception_has_payload = events.exception_has_payload;
+        env->exception_payload = events.exception_payload;
+    } else {
+        env->exception_pending = 0;
+        env->exception_has_payload = false;
+    }
+    env->exception_injected = events.exception.injected;
+    env->exception_nr =
+        (env->exception_pending || env->exception_injected) ?
+        events.exception.nr : -1;
     env->has_error_code = events.exception.has_error_code;
     env->error_code = events.exception.error_code;
 
@@ -3031,12 +3101,12 @@  static int kvm_guest_debug_workarounds(X86CPU *cpu)
     unsigned long reinject_trap = 0;
 
     if (!kvm_has_vcpu_events()) {
-        if (env->exception_injected == EXCP01_DB) {
+        if (env->exception_nr == EXCP01_DB) {
             reinject_trap = KVM_GUESTDBG_INJECT_DB;
         } else if (env->exception_injected == EXCP03_INT3) {
             reinject_trap = KVM_GUESTDBG_INJECT_BP;
         }
-        env->exception_injected = -1;
+        kvm_reset_exception(env);
     }
 
     /*
@@ -3412,13 +3482,13 @@  int kvm_arch_process_async_events(CPUState *cs)
 
         kvm_cpu_synchronize_state(cs);
 
-        if (env->exception_injected == EXCP08_DBLE) {
+        if (env->exception_nr == EXCP08_DBLE) {
             /* this means triple fault */
             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
             cs->exit_request = 1;
             return 0;
         }
-        env->exception_injected = EXCP12_MCHK;
+        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
         env->has_error_code = 0;
 
         cs->halted = 0;
@@ -3633,14 +3703,13 @@  static int kvm_handle_debug(X86CPU *cpu,
     }
     if (ret == 0) {
         cpu_synchronize_state(cs);
-        assert(env->exception_injected == -1);
+        assert(env->exception_nr == -1);
 
         /* pass to guest */
-        env->exception_injected = arch_info->exception;
+        kvm_queue_exception(env, arch_info->exception,
+                            arch_info->exception == EXCP01_DB,
+                            arch_info->dr6);
         env->has_error_code = 0;
-        if (arch_info->exception == EXCP01_DB) {
-            env->dr[6] = arch_info->dr6;
-        }
     }
 
     return ret;
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 95299ebff44a..6aac0fe9cb56 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -240,6 +240,41 @@  static int cpu_pre_save(void *opaque)
     }
 #endif
 
+    /*
+     * When vCPU is running L2 and exception is still pending,
+     * it can potentially be intercepted by L1 hypervisor.
+     * This is in contrast to an injected exception, which
+     * cannot be intercepted anymore.
+     *
+     * Furthermore, when a L2 exception is intercepted by L1
+     * hypervisor, its exception payload (CR2/DR6 on #PF/#DB)
+     * should not be set yet in the respective vCPU register.
+     * Thus, in case an exception is pending, it is
+     * important to save the exception payload separately.
+     *
+     * Therefore, if an exception is not in a pending state
+     * or vCPU is not in guest-mode, it is not important to
+     * distinguish between a pending and injected exception
+     * and we don't need to store the exception payload separately.
+     *
+     * In order to preserve better backwards-compatible migration,
+     * convert a pending exception to an injected exception in
+     * case it is not important to distinguish between them
+     * as described above.
+     */
+    if (env->exception_pending && !(env->hflags & HF_GUEST_MASK)) {
+        env->exception_pending = 0;
+        env->exception_injected = 1;
+
+        if (env->exception_has_payload) {
+            if (env->exception_nr == EXCP01_DB) {
+                env->dr[6] = env->exception_payload;
+            } else if (env->exception_nr == EXCP0E_PAGE) {
+                env->cr[2] = env->exception_payload;
+            }
+        }
+    }
+
     return 0;
 }
 
@@ -297,6 +332,23 @@  static int cpu_post_load(void *opaque, int version_id)
     }
 #endif
 
+    /*
+     * There are cases in which we can get a valid exception_nr with
+     * both exception_pending and exception_injected being cleared.
+     * This can happen in one of the following scenarios:
+     * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support.
+     * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support.
+     * 3) "cpu/exception_info" subsection not sent because there is no exception
+     *    pending or guest wasn't running L2 (See comment in cpu_pre_save()).
+     *
+     * In those cases, we can just deduce that a valid exception_nr means
+     * we can treat the exception as already injected.
+     */
+    if ((env->exception_nr != -1) &&
+        !env->exception_pending && !env->exception_injected) {
+        env->exception_injected = 1;
+    }
+
     env->fpstt = (env->fpus_vmstate >> 11) & 7;
     env->fpus = env->fpus_vmstate & ~0x3800;
     env->fptag_vmstate ^= 0xff;
@@ -342,6 +394,35 @@  static bool steal_time_msr_needed(void *opaque)
     return cpu->env.steal_time_msr != 0;
 }
 
+static bool exception_info_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    /*
+     * It is important to save exception-info only in case
+     * we need to distinguish between a pending and injected
+     * exception, which is only required in case there is a
+     * pending exception and vCPU is running L2.
+     * For more info, refer to comment in cpu_pre_save().
+     */
+    return (env->exception_pending && (env->hflags & HF_GUEST_MASK));
+}
+
+static const VMStateDescription vmstate_exception_info = {
+    .name = "cpu/exception_info",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = exception_info_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8(env.exception_pending, X86CPU),
+        VMSTATE_UINT8(env.exception_injected, X86CPU),
+        VMSTATE_UINT8(env.exception_has_payload, X86CPU),
+        VMSTATE_UINT64(env.exception_payload, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_steal_time_msr = {
     .name = "cpu/steal_time_msr",
     .version_id = 1,
@@ -1228,7 +1309,7 @@  VMStateDescription vmstate_x86_cpu = {
         VMSTATE_INT32(env.interrupt_injected, X86CPU),
         VMSTATE_UINT32(env.mp_state, X86CPU),
         VMSTATE_UINT64(env.tsc, X86CPU),
-        VMSTATE_INT32(env.exception_injected, X86CPU),
+        VMSTATE_INT32(env.exception_nr, X86CPU),
         VMSTATE_UINT8(env.soft_interrupt, X86CPU),
         VMSTATE_UINT8(env.nmi_injected, X86CPU),
         VMSTATE_UINT8(env.nmi_pending, X86CPU),
@@ -1252,6 +1333,7 @@  VMStateDescription vmstate_x86_cpu = {
         /* The above list is not sorted /wrt version numbers, watch out! */
     },
     .subsections = (const VMStateDescription*[]) {
+        &vmstate_exception_info,
         &vmstate_async_pf_msr,
         &vmstate_pv_eoi_msr,
         &vmstate_steal_time_msr,