diff mbox

[v3,3/6] KVM: plumb userspace ABI for ring-based dirty memory tracking

Message ID BYAPR08MB3973EE452933E7F770590B19F0710@BYAPR08MB3973.namprd08.prod.outlook.com (mailing list archive)
State New, archived
Headers show

Commit Message

Cao, Lei June 18, 2018, 1:20 p.m. UTC
Signed-off-by: Cao, Lei <Lei.Cao@stratus.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 96 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          | 33 ++++++++++++++
 virt/kvm/kvm_main.c               | 36 ++++++++++++++-
 3 files changed, 164 insertions(+), 1 deletion(-)
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index fc7fd75..4b82452 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -168,6 +168,7 @@  Based on their initialization different VMs may have different capabilities.
 It is thus encouraged to use the vm ioctl to query for capabilities (available
 with KVM_CAP_CHECK_EXTENSION_VM on the vm fd)
 
+
 4.5 KVM_GET_VCPU_MMAP_SIZE
 
 Capability: basic
@@ -180,6 +181,18 @@  The KVM_RUN ioctl (cf.) communicates with userspace via a shared
 memory region.  This ioctl returns the size of that region.  See the
 KVM_RUN documentation for details.
 
+Besides the size of the KVM_RUN communication region, other areas of
+the VCPU file descriptor can be mmap-ed, including:
+
+- if KVM_CAP_COALESCED_MMIO is available, a page at
+  KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons,
+  this page is included in the result of KVM_GET_VCPU_MMAP_SIZE.
+  KVM_CAP_COALESCED_MMIO is not documented yet.
+
+- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at
+  KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE.  For more information on
+  KVM_CAP_DIRTY_LOG_RING, see section 8.3.
+
 
 4.6 KVM_SET_MEMORY_REGION
 
@@ -4374,3 +4387,86 @@  Parameters: none
 This capability indicates if the flic device will be able to get/set the
 AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
 to discover this without having to create a flic device.
+
+8.14 KVM_CAP_DIRTY_LOG_RING
+
+Architectures: x86
+Parameters: args[0] - size of the dirty log ring
+
+Kernel is capable of tracking dirty memory using rings, which
+are stored in memory regions that can be mmapped into userspace.
+
+There is one dirty ring per vcpu and one global ring.
+
+The dirty ring has the following structure.
+
+struct kvm_dirty_gfn {
+        __u32 pad;
+        __u32 slot; /* as_id | slot_id */
+        __u64 offset;
+};
+
+struct kvm_dirty_ring {
+       union {
+               struct {
+                       __u16 avail_index; /* set by kernel */
+                       __u16 fetch_index; /* set by userspace */
+               } indices;
+               struct kvm_dirty_gfn dirty_gfns[0];
+       };
+};
+
+The two indices in the ring buffer are free running counters.
+They are _not_ limited to the range 0..size-1 where "size" is
+the number of elements of the ring buffer.  This makes it easy
+to compute the number of entries in the ring buffer, which is
+simply (u16)(ring->avail_index - ring->fetch_index).
+
+In pseudocode, processing the ring buffer looks like this:
+
+	idx = load-acquire(&ring->fetch_index);
+	while (idx != ring->avail_index) {
+		struct kvm_dirty_gfn *entry;
+		entry = &ring->dirty_gfns[idx & (size - 1)];
+		...
+
+		idx++;
+	}
+	ring->fetch_index = idx;
+
+Userspace calls KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM ioctl
+to enable this capability for the new guest and set the size of the
+rings. The size of the ring must be a power of two.  The larger the
+ring buffer, the less likely the ring is full and the VM is forced to
+exit to userspace. The optimal size depends on the workload, but it is
+recommended that it be at least 64 KiB (4096 entries).
+
+After the capability is enabled, userspace mmaps the global ring
+buffer from the VM file descriptor.  The per-vcpu dirty ring instead
+is mmapped when the vcpu is created, similar to the kvm_run struct.
+The per-vcpu dirty ring is located at offset KVM_DIRTY_LOG_PAGE_OFFSET *
+PAGE_SIZE of the memory mapped region.
+
+To enable the dirty logging ring buffer, userspace calls
+KVM_SET_USER_MEMORY_REGION ioctls on all the user memory regions
+with KVM_MEM_LOG_DIRTY_PAGES bit set.
+
+To disable the dirty logging ring buffer, userspace calls
+KVM_SET_USER_MEMORY_REGION ioctls on all the user memory regions
+with KVM_MEM_LOG_DIRTY_PAGES bit clear.
+
+Once the dirty logging is enabled, userspace can start harvesting
+dirty pages.
+
+To harvest the dirty pages, userspace accesses the mmapped ring
+buffer to read the dirty GFNs up to avail_index and set the
+fetch_index accordingly. Harvesting can be done when the guest is
+running or paused. Dirty pages don't need to be harvested all at
+once.  To rearm the dirty traps, userspace calls the VM ioctl
+KVM_RESET_DIRTY_PAGES.
+
+If one of the dirty rings is full, the guest will exit to userspace
+with the exit reason set to KVM_EXIT_DIRTY_LOG_FULL, and the
+KVM_RUN ioctl will return -EINTR. Once that happens, userspace
+should pause all the vcpus, then harvest all the dirty pages and
+rearm the dirty traps. It can unpause the guest after that.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 496e59a..903a016 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -235,6 +235,7 @@  struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI        25
 #define KVM_EXIT_IOAPIC_EOI       26
 #define KVM_EXIT_HYPERV           27
+#define KVM_EXIT_DIRTY_LOG_FULL   28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -932,6 +933,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_SYNIC2 148
 #define KVM_CAP_HYPERV_VP_INDEX 149
 #define KVM_CAP_S390_AIS_MIGRATION 150
+#define KVM_CAP_DIRTY_LOG_RING 151
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1358,6 +1360,8 @@  struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_S390_CMMA_MIGRATION */
 #define KVM_S390_GET_CMMA_BITS      _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
 #define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+/* Available with KVM_CAP_DIRTY_LOG_RING */
+#define KVM_RESET_DIRTY_PAGES     _IO(KVMIO,   0xba)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
@@ -1419,4 +1423,33 @@  struct kvm_assigned_msix_entry {
 #define KVM_ARM_DEV_EL1_PTIMER		(1 << 1)
 #define KVM_ARM_DEV_PMU			(1 << 2)
 
+/*
+ * The following are the requirements for supporting dirty log ring
+ * (by enabling KVM_DIRTY_LOG_PAGE_OFFSET).
+ *
+ * 1. Memory accesses by KVM should call kvm_vcpu_write_* instead
+ *    of kvm_write_* so that the global dirty ring is not filled up
+ *    too quickly.
+ * 2. kvm_arch_mmu_enable_log_dirty_pt_masked should be defined for
+ *    enabling dirty logging.
+ * 3. There should not be a separate step to synchronize hardware
+ *    dirty bitmap with KVM's.
+ */
+
+struct kvm_dirty_gfn {
+	__u32 pad;
+	__u32 slot;
+	__u64 offset;
+};
+
+struct kvm_dirty_ring {
+	union {
+		struct {
+			__u16 avail_index; /* set by kernel */
+			__u16 fetch_index; /* set by userspace */
+		} indices;
+		struct kvm_dirty_gfn dirty_gfns[0];
+	};
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6e4d71c..bdccaf8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2945,6 +2945,10 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_MULTI_ADDRESS_SPACE:
 		return KVM_ADDRESS_SPACE_NUM;
 #endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	case KVM_CAP_DIRTY_LOG_RING:
+		return KVM_DIRTY_LOG_PAGE_OFFSET;
+#endif
 	case KVM_CAP_MAX_VCPU_ID:
 		return KVM_MAX_VCPU_ID;
 	default:
@@ -2953,12 +2957,37 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+	return -EINVAL;
+}
+
+static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
+{
+	return -EINVAL;
+}
+#endif
+
 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 						  struct kvm_enable_cap *cap)
 {
 	return -EINVAL;
 }
 
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
+					   struct kvm_enable_cap *cap)
+{
+	switch (cap->cap) {
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	case KVM_CAP_DIRTY_LOG_RING:
+		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
+#endif
+	default:
+		return kvm_vm_ioctl_enable_cap(kvm, cap);
+	}
+}
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -2978,7 +3007,7 @@  static long kvm_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&cap, argp, sizeof(cap)))
 			goto out;
-		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
 		break;
 	}
 	case KVM_SET_USER_MEMORY_REGION: {
@@ -3129,6 +3158,11 @@  static long kvm_vm_ioctl(struct file *filp,
 	case KVM_CHECK_EXTENSION:
 		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
 		break;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	case KVM_RESET_DIRTY_PAGES:
+		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
+		break;
+#endif /* KVM_DIRTY_LOG_PAGE_OFFSET */
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}