
[v2,3/4] KVM: Dirty memory tracking for performant checkpointing solutions

Message ID CY1PR08MB199264AA4EF263118B64D0CEF0610@CY1PR08MB1992.namprd08.prod.outlook.com (mailing list archive)
State New, archived

Commit Message

Cao, Lei Jan. 4, 2017, 8:44 p.m. UTC
Force vcpus to exit when the dirty list becomes full.

Signed-off-by: Lei Cao <lei.cao@stratus.com>
---
 arch/x86/include/asm/kvm_host.h |  7 +++++++
 arch/x86/kvm/mmu.c              |  7 +++++++
 arch/x86/kvm/vmx.c              |  7 +++++++
 arch/x86/kvm/x86.c              | 10 ++++++++++
 include/linux/kvm_host.h        |  1 +
 include/uapi/linux/kvm.h        |  1 +
 virt/kvm/kvm_main.c             | 36 ++++++++++++++++++++++++++++++++++++
 7 files changed, 69 insertions(+)

Comments

Wanpeng Li Jan. 5, 2017, 3:01 a.m. UTC | #1
Hi Cao,
2017-01-05 4:44 GMT+08:00 Cao, Lei <Lei.Cao@stratus.com>:
> Force vcpus to exit when the dirty list becomes full.
>

I saw your presentation slides from the KVM Forum, where you mentioned
that "CPU throttling may not be effective for some workloads where
memory write speed is not dependent on CPU execution speed". Could you
point out which kinds of workloads have a memory write speed that does
not depend on CPU execution speed? Is the memory in such workloads
mainly dirtied by DMA or by something else?

Regards,
Wanpeng Li

> Signed-off-by: Lei Cao <lei.cao@stratus.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 +++++++
>  arch/x86/kvm/mmu.c              |  7 +++++++
>  arch/x86/kvm/vmx.c              |  7 +++++++
>  arch/x86/kvm/x86.c              | 10 ++++++++++
>  include/linux/kvm_host.h        |  1 +
>  include/uapi/linux/kvm.h        |  1 +
>  virt/kvm/kvm_main.c             | 36 ++++++++++++++++++++++++++++++++++++
>  7 files changed, 69 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6dfb14a..20a9fc8 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -75,6 +75,7 @@
>  #define KVM_REQ_HV_RESET          28
>  #define KVM_REQ_HV_EXIT           29
>  #define KVM_REQ_HV_STIMER         30
> +#define KVM_REQ_EXIT_DIRTY_LOG_FULL 31
>
>  #define CR0_RESERVED_BITS                                               \
>         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -997,6 +998,8 @@ struct kvm_x86_ops {
>          *  - enable_log_dirty_pt_masked:
>          *      called when reenabling log dirty for the GFNs in the mask after
>          *      corresponding bits are cleared in slot->dirty_bitmap.
> +        *  - cpu_dirty_log_size:
> +        *      called to inquire about the size of the hardware dirty log
>          */
>         void (*slot_enable_log_dirty)(struct kvm *kvm,
>                                       struct kvm_memory_slot *slot);
> @@ -1006,6 +1009,8 @@ struct kvm_x86_ops {
>         void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
>                                            struct kvm_memory_slot *slot,
>                                            gfn_t offset, unsigned long mask);
> +       int (*cpu_dirty_log_size)(void);
> +
>         /* pmu operations of sub-arch */
>         const struct kvm_pmu_ops *pmu_ops;
>
> @@ -1388,6 +1393,8 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>                      struct kvm_lapic_irq *irq);
>
> +int kvm_mt_cpu_dirty_log_size(void);
> +
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
>  {
>         if (kvm_x86_ops->vcpu_blocking)
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 7012de4..e0668a0 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4980,6 +4980,13 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
>         }
>  }
>
> +int kvm_mt_cpu_dirty_log_size(void)
> +{
> +       if (kvm_x86_ops->cpu_dirty_log_size)
> +               return kvm_x86_ops->cpu_dirty_log_size();
> +       return 0;
> +}
> +
>  static unsigned long
>  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  {
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index ba20b00..76f88b0 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -6729,6 +6729,7 @@ static __init int hardware_setup(void)
>                 kvm_x86_ops->slot_disable_log_dirty = NULL;
>                 kvm_x86_ops->flush_log_dirty = NULL;
>                 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
> +               kvm_x86_ops->cpu_dirty_log_size = NULL;
>         }
>
>         if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
> @@ -11503,6 +11504,11 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
>                         ~FEATURE_CONTROL_LMCE;
>  }
>
> +static int vmx_cpu_dirty_log_size(void)
> +{
> +       return PML_ENTITY_NUM;
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>         .cpu_has_kvm_support = cpu_has_kvm_support,
>         .disabled_by_bios = vmx_disabled_by_bios,
> @@ -11617,6 +11623,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>         .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
>         .flush_log_dirty = vmx_flush_log_dirty,
>         .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
> +       .cpu_dirty_log_size = vmx_cpu_dirty_log_size,
>
>         .pre_block = vmx_pre_block,
>         .post_block = vmx_post_block,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5707129..e2f4cee 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6714,6 +6714,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>                  */
>                 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
>                         kvm_hv_process_stimers(vcpu);
> +               if (kvm_check_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, vcpu)) {
> +                       vcpu->run->exit_reason = KVM_EXIT_DIRTY_LOG_FULL;
> +                       r = -EINTR;
> +                       if (vcpu->need_exit) {
> +                               vcpu->need_exit = false;
> +                               kvm_make_all_cpus_request(vcpu->kvm,
> +                                       KVM_REQ_EXIT_DIRTY_LOG_FULL);
> +                       }
> +                       goto out;
> +               }
>         }
>
>         /*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7a85b30..b7fedeb 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -283,6 +283,7 @@ struct kvm_vcpu {
>         struct dentry *debugfs_dentry;
>  #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
>         struct gfn_list_t *dirty_logs;
> +       bool need_exit;
>  #endif
>  };
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 05332de..bacb8db 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -205,6 +205,7 @@ struct kvm_hyperv_exit {
>  #define KVM_EXIT_S390_STSI        25
>  #define KVM_EXIT_IOAPIC_EOI       26
>  #define KVM_EXIT_HYPERV           27
> +#define KVM_EXIT_DIRTY_LOG_FULL   28
>
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index bff980c..00d7989 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -270,6 +270,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>                 }
>                 vcpu->dirty_logs = page_address(page);
>         }
> +       vcpu->need_exit = false;
>  #endif
>
>         kvm_vcpu_set_in_spin_loop(vcpu, false);
> @@ -3030,6 +3031,29 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  }
>
>  #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +static void kvm_mt_dirty_log_full(struct kvm *kvm, struct kvm_vcpu *vcpu)
> +{
> +       /*
> +        * Request vcpu exits, but if interrupts are disabled, we have
> +        * to defer the requests because smp_call_xxx may deadlock when
> +        * called that way.
> +        */
> +       if (vcpu && irqs_disabled()) {
> +               kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, vcpu);
> +               vcpu->need_exit = true;
> +       } else {
> +               WARN_ON(irqs_disabled());
> +               kvm_make_all_cpus_request(kvm,
> +                                         KVM_REQ_EXIT_DIRTY_LOG_FULL);
> +       }
> +}
> +
> +/*
> + * estimated number of pages being dirtied during vcpu exit, not counting
> + * hardware dirty log (PML) flush
> + */
> +#define KVM_MT_DIRTY_PAGE_NUM_EXTRA 128
> +
>  void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>         struct kvm_vcpu *vcpu, gfn_t gfn)
>  {
> @@ -3037,6 +3061,7 @@ void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>         int slot_id;
>         u32 as_id = 0;
>         u64 offset;
> +       u32 extra = KVM_MT_DIRTY_PAGE_NUM_EXTRA;
>
>         if (!slot || !slot->dirty_bitmap || !kvm->dirty_log_size)
>                 return;
> @@ -3068,6 +3093,17 @@ void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>         gfnlist->dirty_gfns[gfnlist->dirty_index].offset = offset;
>         smp_wmb();
>         gfnlist->dirty_index++;
> +
> +       /*
> +        * more pages will be dirtied during vcpu exit, e.g. pml log
> +        * being flushed. So allow some buffer space.
> +        */
> +       if (vcpu)
> +               extra += kvm_mt_cpu_dirty_log_size();
> +
> +       if (gfnlist->dirty_index == (kvm->max_dirty_logs - extra))
> +               kvm_mt_dirty_log_full(kvm, vcpu);
> +
>         if (!vcpu)
>                 spin_unlock(&kvm->dirty_log_lock);
>  }
> --
> 2.5.0
>
Paolo Bonzini Jan. 5, 2017, 10:15 a.m. UTC | #2
On 04/01/2017 21:44, Cao, Lei wrote:
> Force vcpus to exit when the dirty list becomes full.
> 
> Signed-off-by: Lei Cao <lei.cao@stratus.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 +++++++
>  arch/x86/kvm/mmu.c              |  7 +++++++
>  arch/x86/kvm/vmx.c              |  7 +++++++
>  arch/x86/kvm/x86.c              | 10 ++++++++++
>  include/linux/kvm_host.h        |  1 +
>  include/uapi/linux/kvm.h        |  1 +
>  virt/kvm/kvm_main.c             | 36 ++++++++++++++++++++++++++++++++++++
>  7 files changed, 69 insertions(+)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5707129..e2f4cee 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6714,6 +6714,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  		 */
>  		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
>  			kvm_hv_process_stimers(vcpu);
> +		if (kvm_check_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, vcpu)) {
> +			vcpu->run->exit_reason = KVM_EXIT_DIRTY_LOG_FULL;
> +			r = -EINTR;
> +			if (vcpu->need_exit) {
> +				vcpu->need_exit = false;
> +				kvm_make_all_cpus_request(vcpu->kvm,
> +					KVM_REQ_EXIT_DIRTY_LOG_FULL);
> +			}
> +			goto out;
> +		}
>  	}
>  
>  	/*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7a85b30..b7fedeb 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -283,6 +283,7 @@ struct kvm_vcpu {
>  	struct dentry *debugfs_dentry;
>  #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
>  	struct gfn_list_t *dirty_logs;
> +	bool need_exit;
>  #endif
>  };
>  
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 05332de..bacb8db 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -205,6 +205,7 @@ struct kvm_hyperv_exit {
>  #define KVM_EXIT_S390_STSI        25
>  #define KVM_EXIT_IOAPIC_EOI       26
>  #define KVM_EXIT_HYPERV           27
> +#define KVM_EXIT_DIRTY_LOG_FULL   28
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index bff980c..00d7989 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -270,6 +270,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>  		}
>  		vcpu->dirty_logs = page_address(page);
>  	}
> +	vcpu->need_exit = false;
>  #endif
>  
>  	kvm_vcpu_set_in_spin_loop(vcpu, false);
> @@ -3030,6 +3031,29 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  }
>  
>  #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +static void kvm_mt_dirty_log_full(struct kvm *kvm, struct kvm_vcpu *vcpu)
> +{
> +	/*
> +	 * Request vcpu exits, but if interrupts are disabled, we have
> +	 * to defer the requests because smp_call_xxx may deadlock when
> +	 * called that way.
> +	 */
> +	if (vcpu && irqs_disabled()) {
> +		kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, vcpu);
> +		vcpu->need_exit = true;
> +	} else {
> +		WARN_ON(irqs_disabled());
> +		kvm_make_all_cpus_request(kvm,
> +					  KVM_REQ_EXIT_DIRTY_LOG_FULL);
> +	}

Please add a tracepoint here.  Also, why exit all CPUs if the VCPU log
is full?  Let userspace sort that out instead; this also removes the
need for vcpu->need_exit.
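
For illustration, a minimal sketch of such a tracepoint, assuming the
usual TRACE_EVENT machinery (the event and field names below are
illustrative, not taken from the patch):

TRACE_EVENT(kvm_dirty_log_full,
	TP_PROTO(int vcpu_id, u32 dirty_index),
	TP_ARGS(vcpu_id, dirty_index),

	TP_STRUCT__entry(
		__field(int, vcpu_id)
		__field(u32, dirty_index)
	),

	TP_fast_assign(
		__entry->vcpu_id = vcpu_id;
		__entry->dirty_index = dirty_index;
	),

	TP_printk("vcpu %d dirty_index %u",
		  __entry->vcpu_id, __entry->dirty_index)
);

It could then be invoked from kvm_mt_dirty_log_full(), passing the vcpu
id (or -1 on the global path) and the current dirty index.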

I haven't checked that you cannot reach the case of !vcpu &&
irqs_disabled().  Please document your reasoning here.
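
To illustrate the "let userspace sort that out" suggestion above, here
is a hedged sketch of a vcpu thread that handles the new exit reason
itself; harvest_dirty_logs() is a hypothetical helper, not defined by
this series:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: drain this vcpu's dirty-gfn list into the
 * checkpointing engine. Not defined by this series. */
void harvest_dirty_logs(int vcpu_fd, struct kvm_run *run);

static void vcpu_thread_loop(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		/* With this patch, KVM_RUN returns -1/EINTR when the
		 * vcpu is kicked out, with run->exit_reason set. */
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno != EINTR)
			break;
		if (run->exit_reason == KVM_EXIT_DIRTY_LOG_FULL)
			/* Each vcpu drains its own log; the kernel need
			 * not kick every other vcpu. */
			harvest_dirty_logs(vcpu_fd, run);
	}
}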

> +}
> +
> +/*
> + * estimated number of pages being dirtied during vcpu exit, not counting
> + * hardware dirty log (PML) flush
> + */
> +#define KVM_MT_DIRTY_PAGE_NUM_EXTRA 128
> +
>  void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	struct kvm_vcpu *vcpu, gfn_t gfn)
>  {
> @@ -3037,6 +3061,7 @@ void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	int slot_id;
>  	u32 as_id = 0;
>  	u64 offset;
> +	u32 extra = KVM_MT_DIRTY_PAGE_NUM_EXTRA;
>  
>  	if (!slot || !slot->dirty_bitmap || !kvm->dirty_log_size)
>  		return;
> @@ -3068,6 +3093,17 @@ void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	gfnlist->dirty_gfns[gfnlist->dirty_index].offset = offset;
>  	smp_wmb();
>  	gfnlist->dirty_index++;
> +
> +	/*
> +	 * more pages will be dirtied during vcpu exit, e.g. pml log
> +	 * being flushed. So allow some buffer space.
> +	 */
> +	if (vcpu)
> +		extra += kvm_mt_cpu_dirty_log_size();

Please make "extra" a field in struct kvm, so that it is computed just
once.  Also, how many pages will be dirtied?  Will they only be in the
global log or also in the per-vcpu logs?  Perhaps there should be
separate "extra"s for the global and vcpu logs.

Thanks,

Paolo

> +	if (gfnlist->dirty_index == (kvm->max_dirty_logs - extra))
> +		kvm_mt_dirty_log_full(kvm, vcpu);
> +
>  	if (!vcpu)
>  		spin_unlock(&kvm->dirty_log_lock);
>  }
> 

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6dfb14a..20a9fc8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -75,6 +75,7 @@ 
 #define KVM_REQ_HV_RESET          28
 #define KVM_REQ_HV_EXIT           29
 #define KVM_REQ_HV_STIMER         30
+#define KVM_REQ_EXIT_DIRTY_LOG_FULL 31
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -997,6 +998,8 @@  struct kvm_x86_ops {
 	 *  - enable_log_dirty_pt_masked:
 	 *	called when reenabling log dirty for the GFNs in the mask after
 	 *	corresponding bits are cleared in slot->dirty_bitmap.
+	 *  - cpu_dirty_log_size:
+	 *      called to inquire about the size of the hardware dirty log
 	 */
 	void (*slot_enable_log_dirty)(struct kvm *kvm,
 				      struct kvm_memory_slot *slot);
@@ -1006,6 +1009,8 @@  struct kvm_x86_ops {
 	void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
 					   struct kvm_memory_slot *slot,
 					   gfn_t offset, unsigned long mask);
+	int (*cpu_dirty_log_size)(void);
+
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
 
@@ -1388,6 +1393,8 @@  bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
 
+int kvm_mt_cpu_dirty_log_size(void);
+
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
 	if (kvm_x86_ops->vcpu_blocking)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7012de4..e0668a0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4980,6 +4980,13 @@  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
 	}
 }
 
+int kvm_mt_cpu_dirty_log_size(void)
+{
+	if (kvm_x86_ops->cpu_dirty_log_size)
+		return kvm_x86_ops->cpu_dirty_log_size();
+	return 0;
+}
+
 static unsigned long
 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ba20b00..76f88b0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6729,6 +6729,7 @@  static __init int hardware_setup(void)
 		kvm_x86_ops->slot_disable_log_dirty = NULL;
 		kvm_x86_ops->flush_log_dirty = NULL;
 		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+		kvm_x86_ops->cpu_dirty_log_size = NULL;
 	}
 
 	if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
@@ -11503,6 +11504,11 @@  static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 			~FEATURE_CONTROL_LMCE;
 }
 
+static int vmx_cpu_dirty_log_size(void)
+{
+	return PML_ENTITY_NUM;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -11617,6 +11623,7 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+	.cpu_dirty_log_size = vmx_cpu_dirty_log_size,
 
 	.pre_block = vmx_pre_block,
 	.post_block = vmx_post_block,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5707129..e2f4cee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6714,6 +6714,16 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		 */
 		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
 			kvm_hv_process_stimers(vcpu);
+		if (kvm_check_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_DIRTY_LOG_FULL;
+			r = -EINTR;
+			if (vcpu->need_exit) {
+				vcpu->need_exit = false;
+				kvm_make_all_cpus_request(vcpu->kvm,
+					KVM_REQ_EXIT_DIRTY_LOG_FULL);
+			}
+			goto out;
+		}
 	}
 
 	/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7a85b30..b7fedeb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -283,6 +283,7 @@  struct kvm_vcpu {
 	struct dentry *debugfs_dentry;
 #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
 	struct gfn_list_t *dirty_logs;
+	bool need_exit;
 #endif
 };
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 05332de..bacb8db 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -205,6 +205,7 @@  struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI        25
 #define KVM_EXIT_IOAPIC_EOI       26
 #define KVM_EXIT_HYPERV           27
+#define KVM_EXIT_DIRTY_LOG_FULL   28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bff980c..00d7989 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -270,6 +270,7 @@  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 		}
 		vcpu->dirty_logs = page_address(page);
 	}
+	vcpu->need_exit = false;
 #endif
 
 	kvm_vcpu_set_in_spin_loop(vcpu, false);
@@ -3030,6 +3031,29 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 }
 
 #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void kvm_mt_dirty_log_full(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Request vcpu exits, but if interrupts are disabled, we have
+	 * to defer the requests because smp_call_xxx may deadlock when
+	 * called that way.
+	 */
+	if (vcpu && irqs_disabled()) {
+		kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, vcpu);
+		vcpu->need_exit = true;
+	} else {
+		WARN_ON(irqs_disabled());
+		kvm_make_all_cpus_request(kvm,
+					  KVM_REQ_EXIT_DIRTY_LOG_FULL);
+	}
+}
+
+/*
+ * estimated number of pages being dirtied during vcpu exit, not counting
+ * hardware dirty log (PML) flush
+ */
+#define KVM_MT_DIRTY_PAGE_NUM_EXTRA 128
+
 void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
 	struct kvm_vcpu *vcpu, gfn_t gfn)
 {
@@ -3037,6 +3061,7 @@  void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
 	int slot_id;
 	u32 as_id = 0;
 	u64 offset;
+	u32 extra = KVM_MT_DIRTY_PAGE_NUM_EXTRA;
 
 	if (!slot || !slot->dirty_bitmap || !kvm->dirty_log_size)
 		return;
@@ -3068,6 +3093,17 @@  void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
 	gfnlist->dirty_gfns[gfnlist->dirty_index].offset = offset;
 	smp_wmb();
 	gfnlist->dirty_index++;
+
+	/*
+	 * more pages will be dirtied during vcpu exit, e.g. pml log
+	 * being flushed. So allow some buffer space.
+	 */
+	if (vcpu)
+		extra += kvm_mt_cpu_dirty_log_size();
+
+	if (gfnlist->dirty_index == (kvm->max_dirty_logs - extra))
+		kvm_mt_dirty_log_full(kvm, vcpu);
+
 	if (!vcpu)
 		spin_unlock(&kvm->dirty_log_lock);
 }