
[v3,5/5] KVM: Implement ring-based dirty memory tracking

Message ID CY1PR08MB199218484C74449CF8281CF4F04F0@CY1PR08MB1992.namprd08.prod.outlook.com (mailing list archive)
State New, archived

Commit Message

Cao, Lei Feb. 3, 2017, 8:06 p.m. UTC
Implement ring-based dirty memory tracking.

Signed-off-by: Lei Cao <lei.cao@stratus.com>
---
 arch/x86/kvm/Makefile    |   3 +-
 include/linux/kvm_host.h |  14 ++++
 virt/kvm/gfn_ring.c      | 100 +++++++++++++++++++++++
 virt/kvm/kvm_main.c      | 209 ++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 313 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/gfn_ring.c

Comments

Paolo Bonzini Feb. 4, 2017, 7:01 a.m. UTC | #1
Thanks, this is much nicer.  I have another generic comment on the ring
buffer implementation.

If you make the size a power of two, you can use free-running counters,
i.e. avoid the modulo or mask operation every time you write them.

For example

	reset_index = 0, dirty_index = 0 -> 0 items
	reset_index = 0, dirty_index = 512 -> 512 items
	reset_index = 1, dirty_index = 512 -> 511 items

This also removes the need to leave a free item in the ring buffer.  Of
course KVM_ENABLE_CAP_VM then needs to check that the size is a power of
two.
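
For illustration only (these helpers are not from the patch or the review,
but reuse the patch's field names), the bookkeeping with free-running
counters and a power-of-two size boils down to something like:

	static inline u32 kvm_gfn_ring_used(struct kvm_gfn_ring *ring)
	{
		/*
		 * With free-running u32 indices the occupancy is a plain
		 * subtraction; it stays correct across wrap-around.
		 */
		return ring->dirty_index - ring->reset_index;
	}

	static inline struct kvm_dirty_gfn *
	kvm_gfn_ring_entry(struct kvm_gfn_ring *ring, u32 index)
	{
		/* size is a power of two, so masking replaces the modulo. */
		return &ring->dirty_list->dirty_gfns[index & (ring->size - 1)];
	}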

Also, please add some comments to kvm_gfn_ring.h's definition of struct
kvm_gfn_ring.  The ring buffer is split between struct kvm_dirty_list
and struct kvm_gfn_ring, so it's nice to have some documentation and
a reference to struct kvm_dirty_list.  Feel free to harvest my review of
v2 for some text.

On 03/02/2017 12:06, Cao, Lei wrote:
> +	cur_slot = READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
> +	cur_offset = READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
> +	mask = 1;
> +	count++;
> +	gfnring->reset_index = (gfnring->reset_index + 1) % gfnring->size;

This "(x + 1) % size" recurs often.

Let's make the size a power of two and then write it like

	struct kvm_dirty_gfn *entry;
	...
	entry = &list->dirty_gfns[gfnring->reset_index
				  & (gfnring->size - 1)];

	/*
	 * The ring buffer is shared with userspace, which might mmap
	 * it and concurrently modify slot and offset.  Userspace must
	 * not be trusted!  READ_ONCE prevents the compiler from
	 * re-reading the values after they've been range-checked
	 * (the checks are in kvm_reset_dirty_gfn).
	 */
	smp_read_barrier_depends();
	cur_slot = READ_ONCE(entry->slot);
	cur_offset = READ_ONCE(entry->offset);
	gfnring->reset_index++;

> +	while (gfnring->reset_index != fetch) {
> +		next_slot =
> +		  READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
> +		next_offset =
> +		  READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
> +		if ((next_slot != cur_slot) ||
> +		    (next_offset < cur_offset) ||
> +		    ((next_offset - cur_offset) > (BITS_PER_LONG - 1))) {

No extra parentheses please.  Some optimization and cleanup is also
possible:

	entry = &list->dirty_gfns[gfnring->reset_index
				  & (gfnring->size - 1)];
	smp_read_barrier_depends();
	next_slot = READ_ONCE(entry->slot);
	next_offset = READ_ONCE(entry->offset);
	gfnring->reset_index++;

	/*
	 * Try to coalesce the reset operations when the guest is
	 * scanning pages in the same slot.
	 */
	if (next_slot == cur_slot) {
		int delta = next_offset - cur_offset;
		if (delta >= 0 && delta < BITS_PER_LONG) {
			mask |= 1ull << delta;
			continue;
		}

		/* Backwards visit, careful about overflows!  */
		if (delta >= -BITS_PER_LONG && delta < 0 &&
		    (mask << -delta >> -delta) == mask) {
			cur_offset = next_offset;
			mask = (mask << -delta) | 1;
			continue;
		}
	}

	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
	cur_slot = next_slot;
	cur_offset = next_offset;
	mask = 1;

This should handle backwards writes in the guest more effectively (e.g.
large memmove).  Maybe not super common, but easy enough to handle.

> +			kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> +			cur_slot = next_slot;
> +			cur_offset = next_offset;
> +			mask = 1;
> +		} else
> +			mask |= (u64)1 << (next_offset - cur_offset);
> +		count++;
> +		gfnring->reset_index = (gfnring->reset_index + 1) %
> +					gfnring->size;
> +	}
> +	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> +

[...]

> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	if (kvm->dirty_ring_size) {
> +		r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
> +				       kvm->dirty_ring_size);
> +		if (r) {
> +			kvm->dirty_ring_size = 0;
> +			goto fail_free_run;
> +		}
> +		vcpu->max_dirty_logs =
> +			(kvm->dirty_ring_size/sizeof(struct kvm_dirty_gfn))
> +			- 1 - kvm_cpu_dirty_log_size();
> +	}
> +#endif


> +
> +	kvm->dirty_ring_size = size;
> +	kvm->max_dirty_logs = (size/sizeof(struct kvm_dirty_gfn)) - 1;

Should there be some legroom, in case KVM writes several pages
consecutively?  I would feel safer if you had perhaps 16 empty pages
when exiting to the guest (and then "if (num == max)" becomes
"if (num > max)").

> 
> +	if (vcpu) {
> +		gfnlist = &vcpu->dirty_ring;
> +		max = vcpu->max_dirty_logs;
> +	} else {
> +		gfnlist = &kvm->dirty_ring;
> +		max = kvm->max_dirty_logs;
> +	}

This suggests making the soft limit (max_dirty_logs) an argument of
kvm_gfn_ring_alloc.  kvm_gfn_ring_push can return 0 if the soft limit is
not reached, 1 if it is, and -EBUSY if the hard limit is reached.
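
Spelled out, the suggested interface would be something like the following
(hypothetical prototypes, not code from the patch):

	/* Pass the soft limit at allocation time. */
	int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size,
			       u32 soft_limit);

	/*
	 * Push a dirty gfn; returns 0 below the soft limit, 1 once the
	 * soft limit has been reached, -EBUSY if the ring is full.
	 */
	int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring, u32 slot,
			      u64 offset, bool locked);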

> 
> +
> +	if (((gfnring->dirty_index + 1) % gfnring->size) ==
> +	    gfnring->reset_index)
> +		return -EBUSY;

The spin lock can be taken in the error case too.  It makes it simpler
to understand the code, and the error case is obviously not hot.

> +	if (locked)
> +		spin_lock(&gfnring->lock);
> +
> +	list->dirty_gfns[gfnring->dirty_index].slot = slot;
> +	list->dirty_gfns[gfnring->dirty_index].offset = offset;
> +	smp_wmb();
> +	gfnring->dirty_index = (gfnring->dirty_index+1) % gfnring->size;
> +	num = (gfnring->dirty_index - gfnring->reset_index) % gfnring->size;

With the free-running counter technique, the invariants are clearer and
modulo operations do not clutter the code as much:

	if (locked)
		spin_lock(&gfnring->lock);

	num = (u16)(gfnring->dirty_index - gfnring->reset_index);
	if (num >= gfnring->size) {
		WARN_ON_ONCE(num > gfnring->size);
		r = -EBUSY;
		goto out;
	}

	entry = &list->dirty_gfns[gfnring->dirty_index
				  & (gfnring->size - 1)];
	entry->slot = slot;
	entry->offset = offset;
	smp_wmb();
	gfnring->dirty_index++;
	num++;
	r = num >= gfnring->soft_limit;
out:
	if (locked)
		spin_unlock(&gfnring->lock);

	return r;

Paolo
Paolo Bonzini Feb. 4, 2017, 7:12 a.m. UTC | #2
On 03/02/2017 12:06, Cao, Lei wrote:
> +struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
> +{
> +	return vmalloc_to_page((void *)ring->dirty_list+i*PAGE_SIZE);
> +
> +}

Oops, you need range checking here.  Otherwise you have a gaping kernel
memory leak! :)
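
A sketch of the missing check; the nr_pages field is an assumption here,
since the current structure only records the entry count:

	struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
	{
		/* nr_pages would be the number of vmalloc'ed ring pages. */
		if (i >= ring->nr_pages)
			return NULL;

		return vmalloc_to_page((void *)ring->dirty_list + i * PAGE_SIZE);
	}

The vcpu/vm fault handlers would then have to check for a NULL return and
report VM_FAULT_SIGBUS instead of calling get_page() on it.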

Paolo

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 3bff207..d832622 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -8,7 +8,8 @@  CFLAGS_vmx.o := -I.
 KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+				$(KVM)/gfn_ring.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 65561bf..7fd045b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,7 @@ 
 #include <linux/kvm_types.h>
 
 #include <asm/kvm_host.h>
+#include <linux/kvm_gfn_ring.h>
 
 #ifndef KVM_MAX_VCPU_ID
 #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -265,6 +266,11 @@  struct kvm_vcpu {
 	bool preempted;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	u32 max_dirty_logs;
+	struct kvm_gfn_ring dirty_ring;
+#endif
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -430,6 +436,12 @@  struct kvm {
 	struct list_head devices;
 	struct dentry *debugfs_dentry;
 	struct kvm_stat_data **debugfs_stat_data;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	u32 dirty_ring_size;
+	u32 max_dirty_logs;
+	struct kvm_gfn_ring dirty_ring;
+#endif
 };
 
 #define kvm_err(fmt, ...) \
@@ -713,6 +725,8 @@  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					gfn_t gfn_offset,
 					unsigned long mask);
 
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
+
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
 
diff --git a/virt/kvm/gfn_ring.c b/virt/kvm/gfn_ring.c
new file mode 100644
index 0000000..5d4977c
--- /dev/null
+++ b/virt/kvm/gfn_ring.c
@@ -0,0 +1,100 @@ 
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_gfn_ring.h>
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size)
+{
+	gfnring->dirty_list = vmalloc(size);
+	if (!gfnring->dirty_list)
+		return -ENOMEM;
+	memset(gfnring->dirty_list, 0, size);
+
+	gfnring->size = size/sizeof(struct kvm_dirty_gfn);
+	gfnring->dirty_index = 0;
+	gfnring->reset_index = 0;
+	spin_lock_init(&gfnring->lock);
+
+	return 0;
+}
+
+int kvm_gfn_ring_reset(struct kvm *kvm, struct kvm_gfn_ring *gfnring)
+{
+	u32 cur_slot, next_slot;
+	u64 cur_offset, next_offset;
+	unsigned long mask = 0;
+	u32 fetch;
+	int count = 0;
+	struct kvm_dirty_list *list = gfnring->dirty_list;
+
+	fetch = READ_ONCE(list->indices.fetch_index);
+	if (fetch == gfnring->reset_index)
+		return 0;
+
+	cur_slot = READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
+	cur_offset = READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
+	mask = 1;
+	count++;
+	gfnring->reset_index = (gfnring->reset_index + 1) % gfnring->size;
+	while (gfnring->reset_index != fetch) {
+		next_slot =
+		  READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
+		next_offset =
+		  READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
+		if ((next_slot != cur_slot) ||
+		    (next_offset < cur_offset) ||
+		    ((next_offset - cur_offset) > (BITS_PER_LONG - 1))) {
+			kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+			cur_slot = next_slot;
+			cur_offset = next_offset;
+			mask = 1;
+		} else
+			mask |= (u64)1 << (next_offset - cur_offset);
+		count++;
+		gfnring->reset_index = (gfnring->reset_index + 1) %
+					gfnring->size;
+	}
+	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+	return count;
+}
+
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+		      u32 slot,
+		      u64 offset,
+		      bool locked)
+{
+	int num;
+	struct kvm_dirty_list *list = gfnring->dirty_list;
+
+	if (((gfnring->dirty_index + 1) % gfnring->size) ==
+	    gfnring->reset_index)
+		return -EBUSY;
+
+	if (locked)
+		spin_lock(&gfnring->lock);
+
+	list->dirty_gfns[gfnring->dirty_index].slot = slot;
+	list->dirty_gfns[gfnring->dirty_index].offset = offset;
+	smp_wmb();
+	gfnring->dirty_index = (gfnring->dirty_index+1) % gfnring->size;
+	num = (gfnring->dirty_index - gfnring->reset_index) % gfnring->size;
+	list->indices.avail_index = gfnring->dirty_index;
+
+	if (locked)
+		spin_unlock(&gfnring->lock);
+
+	return num;
+}
+
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
+{
+	return vmalloc_to_page((void *)ring->dirty_list+i*PAGE_SIZE);
+
+}
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *gfnring)
+{
+	if (gfnring->dirty_list)
+		vfree(gfnring->dirty_list);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 417c0ff..569343f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -63,6 +63,8 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+#include <linux/kvm_gfn_ring.h>
+
 /* Worst case buffer size needed for holding an integer. */
 #define ITOA_MAX_LEN 12
 
@@ -121,7 +123,16 @@  static void hardware_disable_all(void);
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
 static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *memslot,
+				    gfn_t gfn);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn);
+#endif
 
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -258,11 +269,34 @@  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->preempted = false;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size) {
+		r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
+				       kvm->dirty_ring_size);
+		if (r) {
+			kvm->dirty_ring_size = 0;
+			goto fail_free_run;
+		}
+		vcpu->max_dirty_logs =
+			(kvm->dirty_ring_size/sizeof(struct kvm_dirty_gfn))
+			- 1 - kvm_cpu_dirty_log_size();
+	}
+#endif
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		goto fail_free_ring;
+#else
 		goto fail_free_run;
+#endif
 	return 0;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+fail_free_ring:
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 fail_free_run:
 	free_page((unsigned long)vcpu->run);
 fail:
@@ -275,6 +309,10 @@  void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 	put_pid(vcpu->pid);
 	kvm_arch_vcpu_uninit(vcpu);
 	free_page((unsigned long)vcpu->run);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (vcpu->kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
@@ -726,6 +764,10 @@  static void kvm_destroy_vm(struct kvm *kvm)
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kvm_io_bus_destroy(kvm->buses[i]);
 	kvm_coalesced_mmio_free(kvm);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&kvm->dirty_ring);
+#endif
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
@@ -1861,7 +1903,8 @@  int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
 
-static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
+static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_vcpu *vcpu,
+				  struct kvm_memory_slot *memslot, gfn_t gfn,
 			          const void *data, int offset, int len)
 {
 	int r;
@@ -1873,7 +1916,7 @@  static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
 	r = __copy_to_user((void __user *)addr + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(kvm, vcpu, memslot, gfn);
 	return 0;
 }
 
@@ -1882,7 +1925,8 @@  int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(kvm, NULL, slot, gfn, data,
+				      offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
@@ -1891,7 +1935,8 @@  int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(vcpu->kvm, vcpu, slot, gfn, data,
+				      offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
 
@@ -1995,7 +2040,7 @@  int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cac
 	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(v->kvm, v, ghc->memslot, gpa >> PAGE_SHIFT);
 
 	return 0;
 }
@@ -2060,12 +2105,17 @@  int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *memslot,
 				    gfn_t gfn)
 {
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		mark_page_dirty_in_ring(kvm, vcpu, memslot, gfn);
+#endif
 		set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }
@@ -2075,7 +2125,7 @@  void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memory_slot *memslot;
 
 	memslot = gfn_to_memslot(kvm, gfn);
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(kvm, NULL, memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
 
@@ -2084,7 +2134,7 @@  void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 	struct kvm_memory_slot *memslot;
 
 	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(vcpu->kvm, vcpu, memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
@@ -2363,6 +2413,11 @@  static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	else if (vmf->pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET)
+		page = kvm_gfn_ring_get_page(&vcpu->dirty_ring,
+				vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
+#endif
 	else
 		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
@@ -2946,14 +3001,118 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 }
 
 #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
-static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, __u32 size)
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn)
 {
-	return -EINVAL;
+	struct kvm_gfn_ring *gfnlist;
+	u32 as_id = 0;
+	u64 offset;
+	struct kvm_vcpu *exit_vcpu = vcpu;
+	int num;
+	bool locked;
+	u32 max;
+
+	if (!kvm->dirty_ring_size)
+		return;
+
+	offset = gfn - slot->base_gfn;
+
+	if (test_bit_le(offset, slot->dirty_bitmap))
+		return;
+
+	if (vcpu)
+		as_id = kvm_arch_vcpu_memslots_id(vcpu);
+
+	locked = (vcpu == NULL);
+
+	if (vcpu) {
+		gfnlist = &vcpu->dirty_ring;
+		max = vcpu->max_dirty_logs;
+	} else {
+		gfnlist = &kvm->dirty_ring;
+		max = kvm->max_dirty_logs;
+	}
+
+	num = kvm_gfn_ring_push(gfnlist, (as_id << 16)|slot->id,
+		offset, locked);
+	if (num < 0) {
+		if (vcpu)
+			WARN_ONCE(1, "vcpu %d dirty log overflow\n",
+				vcpu->vcpu_id);
+		else
+			WARN_ONCE(1, "global dirty log overflow\n");
+		return;
+	}
+
+	if (num == max) {
+		if (!exit_vcpu)
+			exit_vcpu = kvm->vcpus[0];
+		kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, exit_vcpu);
+	}
+}
+
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+	struct kvm_memory_slot *memslot;
+	int as_id, id;
+
+	as_id = slot >> 16;
+	id = (u16)slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+	if (offset >= memslot->npages)
+		return;
+
+	spin_lock(&kvm->mmu_lock);
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+	spin_unlock(&kvm->mmu_lock);
+
+	while (mask) {
+		clear_bit_le(offset + __ffs(mask), memslot->dirty_bitmap);
+		mask &= mask - 1;
+	}
+}
+
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+	int r;
+
+	kvm->dirty_ring_size = size;
+	kvm->max_dirty_logs = (size/sizeof(struct kvm_dirty_gfn)) - 1;
+	r = kvm_gfn_ring_alloc(&kvm->dirty_ring, size);
+	if (r) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+	return 0;
 }
 
 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
 {
-	return -EINVAL;
+	int i;
+	struct kvm_vcpu *vcpu;
+	int cleared = 0;
+
+	if (!kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->slots_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		cleared += kvm_gfn_ring_reset(kvm, &vcpu->dirty_ring);
+
+	cleared += kvm_gfn_ring_reset(kvm, &kvm->dirty_ring);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return cleared;
 }
 #endif
 
@@ -3202,6 +3361,29 @@  static long kvm_vm_compat_ioctl(struct file *filp,
 }
 #endif
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvm *kvm = vma->vm_file->private_data;
+	struct page *page;
+
+	page = kvm_gfn_ring_get_page(&kvm->dirty_ring, vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_vm_vm_ops = {
+	.fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vm_vm_ops;
+	return 0;
+}
+#endif
+
 static struct file_operations kvm_vm_fops = {
 	.release        = kvm_vm_release,
 	.unlocked_ioctl = kvm_vm_ioctl,
@@ -3209,6 +3391,9 @@  static struct file_operations kvm_vm_fops = {
 	.compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
 	.llseek		= noop_llseek,
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	.mmap           = kvm_vm_mmap,
+#endif
 };
 
 static int kvm_dev_ioctl_create_vm(unsigned long type)