
[v3,5/6] KVM: Implement ring-based dirty memory tracking

Message ID BYAPR08MB397312D2071AA41384E8A99AF0710@BYAPR08MB3973.namprd08.prod.outlook.com (mailing list archive)
State New, archived

Commit Message

Cao, Lei June 18, 2018, 1:22 p.m. UTC
KVM currently uses large bitmaps to track dirty memory.  These bitmaps
are copied to userspace when userspace queries KVM for its dirty page
information.  The use of bitmaps is sufficient for the live-migration
use case, as large parts of memory are dirtied from one log-dirty
pass to another.  In a checkpointing system, however, the number of
dirty pages is small and in fact often bounded: the VM is paused
when it has dirtied a pre-defined number of pages.  Traversing a large,
sparsely populated bitmap to find the set bits is time-consuming, as is
copying the bitmap to userspace.

The preferred data structure for performant checkpointing solutions is
a dense list of guest frame numbers (GFNs).  This patch series stores
the dirty list in kernel memory that can be memory-mapped into
userspace to allow speedy harvesting.
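
For illustration only (not part of this patch): below is a rough sketch of
how a userspace checkpointing loop might mmap and drain one of these rings.
It relies on the kvm_dirty_ring/kvm_dirty_gfn layout and the
KVM_DIRTY_LOG_PAGE_OFFSET mmap offset used by this series; the helper names,
the ring-size argument and the omitted error handling are illustrative only,
and the ioctls that enable the ring and re-arm the dirty traps are added by
other patches in the series.

#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

/* Map one vcpu's dirty ring; ring_bytes is the power-of-two ring size. */
static struct kvm_dirty_ring *map_dirty_ring(int vcpu_fd, size_t ring_bytes)
{
	return mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vcpu_fd, KVM_DIRTY_LOG_PAGE_OFFSET * getpagesize());
}

/* Drain all entries the kernel has pushed so far; entries is a power of 2. */
static void harvest_dirty_ring(struct kvm_dirty_ring *ring, uint32_t entries)
{
	uint32_t fetch = ring->indices.fetch_index;
	/* the kernel advances avail_index after each pushed entry */
	uint32_t avail = __atomic_load_n(&ring->indices.avail_index,
					 __ATOMIC_ACQUIRE);

	while (fetch != avail) {
		struct kvm_dirty_gfn *e =
			&ring->dirty_gfns[fetch & (entries - 1)];

		/*
		 * e->slot is (as_id << 16) | memslot id and e->offset is
		 * the page offset within that memslot; a real harvester
		 * copies or records that page for the checkpoint here.
		 */
		(void)e;
		fetch++;
	}
	/* publish progress; the kernel later resets traps up to fetch_index */
	__atomic_store_n(&ring->indices.fetch_index, fetch, __ATOMIC_RELEASE);
}

Once every ring has been drained this way, userspace issues the reset ioctl
(introduced elsewhere in the series) so that kvm_gfn_ring_reset() can
re-enable the dirty traps for the harvested pages.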

Signed-off-by: Lei Cao <lei.cao@stratus.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/Makefile        |   3 +-
 include/linux/kvm_gfn_ring.h |  68 +++++++++++++++
 include/linux/kvm_host.h     |  12 +++
 virt/kvm/gfn_ring.c          | 135 +++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c          | 200 ++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 415 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/kvm_gfn_ring.h
 create mode 100644 virt/kvm/gfn_ring.c

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index dc4f2fd..19fdd31 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,7 +9,8 @@  CFLAGS_vmx.o := -I.
 KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+				$(KVM)/gfn_ring.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm_gfn_ring.h b/include/linux/kvm_gfn_ring.h
new file mode 100644
index 0000000..9d5ca99
--- /dev/null
+++ b/include/linux/kvm_gfn_ring.h
@@ -0,0 +1,68 @@ 
+#ifndef KVM_GFN_RING_H
+#define KVM_GFN_RING_H
+
+/*
+ * struct kvm_dirty_ring is defined in include/uapi/linux/kvm.h.
+ *
+ * dirty_ring:  shared with userspace via mmap. dirty_ring->dirty_gfns
+ *              is the compact list that holds the dirty pages.
+ * dirty_index: free running counter that points to the next slot in
+ *              dirty_ring->dirty_gfns  where a new dirty page should go.
+ * reset_index: free running counter that points to the next dirty page
+ *              in dirty_ring->dirty_gfns for which the dirty trap needs
+ *              to be re-enabled
+ * size:        size of the compact list, dirty_ring->dirty_gfns
+ * soft_limit:  when the number of dirty pages in the list reaches this
+ *              limit, the vcpu that owns this ring should exit to
+ *              userspace so that userspace can harvest all the dirty pages
+ * lock:        protects dirty_ring, only in use if this is the global
+ *              ring
+ *
+ * The number of dirty pages in the ring is calculated as
+ * dirty_index - reset_index.
+ *
+ * The kernel increments dirty_ring->indices.avail_index after dirty_index
+ * is incremented.  When userspace harvests the dirty pages, it increments
+ * dirty_ring->indices.fetch_index up to dirty_ring->indices.avail_index.
+ * When the kernel re-enables dirty traps for the harvested pages, it
+ * increments reset_index up to dirty_ring->indices.fetch_index.
+ *
+ */
+struct kvm_gfn_ring {
+	u16 dirty_index;
+	u16 reset_index;
+	u32 size;
+	u32 soft_limit;
+	spinlock_t lock;
+	struct kvm_dirty_ring *dirty_ring;
+};
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring,
+		       u32 size,
+		       u32 limit);
+
+/*
+ * Called with kvm->slots_lock held; returns the number of
+ * processed pages.
+ */
+int kvm_gfn_ring_reset(struct kvm *kvm,
+		       struct kvm_gfn_ring *gfnring);
+
+/*
+ * returns 0: successfully pushed
+ *         1: successfully pushed, soft limit reached,
+ *            vcpu should exit to userspace
+ *         -EBUSY: unable to push, dirty ring full.
+ */
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+		      u32 slot,
+		      u64 offset,
+		      bool locked);
+
+/* for use in vm_operations_struct */
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring,
+				   u32 i);
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *ring);
+
+#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11e891a..feee06c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,7 @@ 
 #include <linux/kvm_types.h>
 
 #include <asm/kvm_host.h>
+#include <linux/kvm_gfn_ring.h>
 
 #ifndef KVM_MAX_VCPU_ID
 #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -126,6 +127,7 @@  static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MMU_RELOAD        (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_PENDING_TIMER     2
 #define KVM_REQ_UNHALT            3
+#define KVM_REQ_EXIT_DIRTY_LOG_FULL 4
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -274,6 +276,10 @@  struct kvm_vcpu {
 	bool preempted;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	struct kvm_gfn_ring dirty_ring;
+#endif
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -441,6 +447,10 @@  struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	u32 dirty_ring_size;
+	struct kvm_gfn_ring dirty_ring;
+#endif
 	struct dentry *debugfs_dentry;
 	struct kvm_stat_data **debugfs_stat_data;
 	struct srcu_struct srcu;
@@ -751,6 +761,8 @@  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					gfn_t gfn_offset,
 					unsigned long mask);
 
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
+
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
 
diff --git a/virt/kvm/gfn_ring.c b/virt/kvm/gfn_ring.c
new file mode 100644
index 0000000..cb0f455
--- /dev/null
+++ b/virt/kvm/gfn_ring.c
@@ -0,0 +1,135 @@ 
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_gfn_ring.h>
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size, u32 limit)
+{
+	gfnring->dirty_ring = vmalloc(size);
+	if (!gfnring->dirty_ring)
+		return -ENOMEM;
+	memset(gfnring->dirty_ring, 0, size);
+
+	gfnring->size = size/sizeof(struct kvm_dirty_gfn);
+	gfnring->soft_limit = limit;
+	gfnring->dirty_index = 0;
+	gfnring->reset_index = 0;
+	spin_lock_init(&gfnring->lock);
+
+	return 0;
+}
+
+int kvm_gfn_ring_reset(struct kvm *kvm, struct kvm_gfn_ring *gfnring)
+{
+	u32 cur_slot, next_slot;
+	u64 cur_offset, next_offset;
+	unsigned long mask;
+	u32 fetch;
+	int count = 0;
+	struct kvm_dirty_gfn *entry;
+	struct kvm_dirty_ring *ring = gfnring->dirty_ring;
+
+	fetch = READ_ONCE(ring->indices.fetch_index);
+	if (fetch == gfnring->reset_index)
+		return 0;
+
+	entry = &ring->dirty_gfns[gfnring->reset_index &
+			(gfnring->size - 1)];
+	/*
+	 * The ring buffer is shared with userspace, which might mmap
+	 * it and concurrently modify slot and offset.  Userspace must
+	 * not be trusted!  READ_ONCE prevents the compiler from changing
+	 * the values after they've been range-checked (the checks are
+	 * in kvm_reset_dirty_gfn).
+	 */
+	smp_read_barrier_depends();
+	cur_slot = READ_ONCE(entry->slot);
+	cur_offset = READ_ONCE(entry->offset);
+	mask = 1;
+	count++;
+	gfnring->reset_index++;
+	while (gfnring->reset_index != fetch) {
+		entry = &ring->dirty_gfns[gfnring->reset_index &
+			(gfnring->size - 1)];
+		smp_read_barrier_depends();
+		next_slot = READ_ONCE(entry->slot);
+		next_offset = READ_ONCE(entry->offset);
+		gfnring->reset_index++;
+		count++;
+		/*
+		 * Try to coalesce the reset operations when the guest is
+		 * scanning pages in the same slot.
+		 */
+		if (next_slot == cur_slot) {
+			int delta = next_offset - cur_offset;
+
+			if (delta >= 0 && delta < BITS_PER_LONG) {
+				mask |= 1ull << delta;
+				continue;
+			}
+
+			/* Backwards visit, careful about overflows!  */
+			if (delta > -BITS_PER_LONG && delta < 0 &&
+			    (mask << -delta >> -delta) == mask) {
+				cur_offset = next_offset;
+				mask = (mask << -delta) | 1;
+				continue;
+			}
+		}
+		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+		cur_slot = next_slot;
+		cur_offset = next_offset;
+		mask = 1;
+	}
+	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+	return count;
+}
+
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+		      u32 slot,
+		      u64 offset,
+		      bool locked)
+{
+	int ret;
+	u16 num;
+	struct kvm_dirty_gfn *entry;
+
+	if (locked)
+		spin_lock(&gfnring->lock);
+
+	num = (u16)(gfnring->dirty_index - gfnring->reset_index);
+	if (num >= gfnring->size) {
+		WARN_ON_ONCE(num > gfnring->size);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	entry = &gfnring->dirty_ring->dirty_gfns[gfnring->dirty_index &
+			(gfnring->size - 1)];
+	entry->slot = slot;
+	entry->offset = offset;
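+	/*
+	 * Order the entry writes above before the index updates below so
+	 * that userspace never sees avail_index cover a half-written entry.
+	 */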
+	smp_wmb();
+	gfnring->dirty_index++;
+	num = gfnring->dirty_index - gfnring->reset_index;
+	gfnring->dirty_ring->indices.avail_index = gfnring->dirty_index;
+	ret = num >= gfnring->soft_limit;
+
+out:
+	if (locked)
+		spin_unlock(&gfnring->lock);
+
+	return ret;
+}
+
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
+{
+	return vmalloc_to_page((void *)ring->dirty_ring + i * PAGE_SIZE);
+
+}
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *gfnring)
+{
+	if (gfnring->dirty_ring)
+		vfree(gfnring->dirty_ring);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e8b3d98..8d4b6a7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -65,9 +65,16 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+#include <linux/kvm_gfn_ring.h>
+
 /* Worst case buffer size needed for holding an integer. */
 #define ITOA_MAX_LEN 12
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+/* slack between the soft limit and a completely full dirty log ring */
+#define DIRTY_RING_BUFFER_ENTRY_NUM 16
+#endif
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -127,6 +134,12 @@  static void mark_page_dirty_in_slot(struct kvm *kvm,
 				    struct kvm_vcpu *vcpu,
 				    struct kvm_memory_slot *memslot,
 				    gfn_t gfn);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn);
+#endif
 
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -296,11 +309,36 @@  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->preempted = false;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size) {
+		u32 limit = (kvm->dirty_ring_size /
+			     sizeof(struct kvm_dirty_gfn)) -
+			    DIRTY_RING_BUFFER_ENTRY_NUM -
+			    kvm_cpu_dirty_log_size();
+		r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
+				       kvm->dirty_ring_size,
+				       limit);
+		if (r) {
+			kvm->dirty_ring_size = 0;
+			goto fail_free_run;
+		}
+	}
+#endif
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		goto fail_free_ring;
+#else
 		goto fail_free_run;
+#endif
 	return 0;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+fail_free_ring:
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 fail_free_run:
 	free_page((unsigned long)vcpu->run);
 fail:
@@ -318,6 +356,10 @@  void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 	put_pid(rcu_dereference_protected(vcpu->pid, 1));
 	kvm_arch_vcpu_uninit(vcpu);
 	free_page((unsigned long)vcpu->run);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (vcpu->kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
@@ -727,6 +769,10 @@  static void kvm_destroy_vm(struct kvm *kvm)
 		kvm->buses[i] = NULL;
 	}
 	kvm_coalesced_mmio_free(kvm);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&kvm->dirty_ring);
+#endif
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
@@ -2057,6 +2103,9 @@  static void mark_page_dirty_in_slot(struct kvm *kvm,
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		mark_page_dirty_in_ring(kvm, vcpu, memslot, gfn);
+#endif
 		set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }
@@ -2383,6 +2432,13 @@  static int kvm_vcpu_fault(struct vm_fault *vmf)
 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	else if ((vmf->pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
+		 (vmf->pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
+		  vcpu->kvm->dirty_ring_size / PAGE_SIZE))
+		page = kvm_gfn_ring_get_page(&vcpu->dirty_ring,
+				vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
+#endif
 	else
 		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
@@ -2966,14 +3022,128 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 }
 
 #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn)
+{
+	struct kvm_gfn_ring *gfnlist;
+	u32 as_id = 0;
+	u64 offset;
+	struct kvm_vcpu *exit_vcpu;
+	struct kvm_vcpu *ring_vcpu;
+	int ret;
+	bool locked = false;
+
+	if (!kvm->dirty_ring_size)
+		return;
+
+	offset = gfn - slot->base_gfn;
+
+	if (test_bit_le(offset, slot->dirty_bitmap))
+		return;
+
+	if (vcpu) {
+		as_id = kvm_arch_vcpu_memslots_id(vcpu);
+		ring_vcpu = vcpu;
+	} else {
+		as_id = 0;
+		ring_vcpu = kvm_get_running_vcpu();
+	}
+
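+	/*
+	 * Dirtying from a vcpu context uses that vcpu's private ring;
+	 * otherwise fall back to the spinlock-protected global ring and
+	 * pick vcpu0 to exit to userspace when the ring fills up.
+	 */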
+	if (ring_vcpu) {
+		gfnlist = &ring_vcpu->dirty_ring;
+		exit_vcpu = ring_vcpu;
+	} else {
+		gfnlist = &kvm->dirty_ring;
+		exit_vcpu = kvm->vcpus[0];
+		locked = true;
+	}
+
+	ret = kvm_gfn_ring_push(gfnlist, (as_id << 16)|slot->id,
+				offset, locked);
+	if (ret < 0) {
+		if (vcpu)
+			WARN_ONCE(1, "vcpu %d dirty log overflow\n",
+				vcpu->vcpu_id);
+		else
+			WARN_ONCE(1, "global dirty log overflow\n");
+		return;
+	}
+
+	if (ret)
+		kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, exit_vcpu);
+}
+
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+	struct kvm_memory_slot *memslot;
+	int as_id, id;
+
+	as_id = slot >> 16;
+	id = (u16)slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+	if (offset >= memslot->npages)
+		return;
+
+	spin_lock(&kvm->mmu_lock);
+	/* FIXME: we should use a single AND operation, but there is no
+	 * applicable atomic API.  Re-arm the dirty traps before the loop
+	 * below consumes mask.
+	 */
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+	while (mask) {
+		clear_bit_le(offset + __ffs(mask), memslot->dirty_bitmap);
+		mask &= mask - 1;
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
 {
-	return -EINVAL;
+	int r;
+	u32 limit;
+
+	/* the size should be a power of 2 */
+	if (!size || (size & (size - 1)))
+		return -EINVAL;
+
+	kvm->dirty_ring_size = size;
+	limit = (size/sizeof(struct kvm_dirty_gfn)) -
+		DIRTY_RING_BUFFER_ENTRY_NUM;
+	r = kvm_gfn_ring_alloc(&kvm->dirty_ring, size, limit);
+	if (r) {
+		kvm->dirty_ring_size = 0;
+		return r;
+	}
+	return 0;
 }
 
 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
 {
-	return -EINVAL;
+	int i;
+	struct kvm_vcpu *vcpu;
+	int cleared = 0;
+
+	if (!kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->slots_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		cleared += kvm_gfn_ring_reset(kvm, &vcpu->dirty_ring);
+
+	cleared += kvm_gfn_ring_reset(kvm, &kvm->dirty_ring);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return cleared;
 }
 #endif
 
@@ -3219,6 +3389,29 @@  static long kvm_vm_compat_ioctl(struct file *filp,
 }
 #endif
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_fault(struct vm_fault *vmf)
+{
+	struct kvm *kvm = vmf->vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvm->dirty_ring_size / PAGE_SIZE)
+		return VM_FAULT_SIGBUS;
+
+	page = kvm_gfn_ring_get_page(&kvm->dirty_ring, vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_vm_vm_ops = {
+	.fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vm_vm_ops;
+	return 0;
+}
+#endif
+
 static struct file_operations kvm_vm_fops = {
 	.release        = kvm_vm_release,
 	.unlocked_ioctl = kvm_vm_ioctl,
@@ -3226,6 +3419,9 @@  static long kvm_vm_compat_ioctl(struct file *filp,
 	.compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
 	.llseek		= noop_llseek,
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	.mmap           = kvm_vm_mmap,
+#endif
 };
 
 static int kvm_dev_ioctl_create_vm(unsigned long type)