
[v3,1/4] KVM: Add new generic capability for ring-based dirty memory logging

Message ID: CY1PR08MB1992537C603E27BDD3427978F04F0@CY1PR08MB1992.namprd08.prod.outlook.com (mailing list archive)
State: New, archived

Commit Message

Cao, Lei Feb. 3, 2017, 8:01 p.m. UTC
Add support for capabilities that can be enabled in a generic way.
Introduce new capability: ring-based dirty memory logging

Signed-off-by: Lei Cao <lei.cao@stratus.com>
---
 Documentation/virtual/kvm/api.txt | 79 +++++++++++++++++++++++++++++++++++++--
 arch/powerpc/kvm/powerpc.c        | 14 +------
 arch/s390/kvm/kvm-s390.c          | 11 +-----
 arch/x86/kvm/x86.c                | 14 +------
 include/linux/kvm_host.h          |  2 +
 include/uapi/linux/kvm.h          |  1 +
 virt/kvm/kvm_main.c               | 32 ++++++++++++++++
 7 files changed, 115 insertions(+), 38 deletions(-)

Comments

Paolo Bonzini Feb. 4, 2017, 7:04 a.m. UTC | #1
On 03/02/2017 12:01, Cao, Lei wrote:
> +struct kvm_dirty_list {

Maybe kvm_dirty_ring?

> +       union {
> +               struct {
> +                       __u16 avail_index; /* set by kernel */
> +                       __u16 fetch_index; /* set by userspace */
> +               } indices;
> +               struct kvm_dirty_gfn dirty_gfns[0];
> +       };
> +};

Please explain what this means.  For example, adopting the free-running
counter technique explained in the review of patch 5:

---
The two indices in the ring buffer are free-running counters.  They are
_not_ limited to the range 0..size-1, where "size" is the number of
elements in the ring buffer.  This makes it simpler to compute the number
of entries in the ring buffer, which is simply
(u16)(ring->avail_index - ring->fetch_index).

In pseudocode, processing the ring buffer looks like this:

	idx = load-acquire(&ring->fetch_index);
	while (idx != ring->avail_index) {
		struct kvm_dirty_gfn *entry;
		entry = &ring->dirty_gfns[idx & (size - 1)];
		...

		idx++;
	}
	ring->fetch_index = idx;
---
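
For concreteness, here is a compilable C rendering of that loop.  It is
a sketch only: the struct layout is copied from the patch below, and the
__atomic builtins stand in for the load-acquire/store-release pairing
implied by the pseudocode.

#include <stdint.h>

struct kvm_dirty_gfn {
	uint32_t pad;
	uint32_t slot;   /* as_id | slot_id */
	uint64_t offset;
};

struct kvm_dirty_list {
	union {
		struct {
			uint16_t avail_index; /* set by kernel */
			uint16_t fetch_index; /* set by userspace */
		} indices;
		struct kvm_dirty_gfn dirty_gfns[0];
	};
};

/* Drain all entries published by the kernel; "size" is the
 * (power-of-two) number of kvm_dirty_gfn entries in the ring. */
static void harvest_ring(struct kvm_dirty_list *ring, uint16_t size,
			 void (*consume)(uint32_t slot, uint64_t offset))
{
	uint16_t idx = __atomic_load_n(&ring->indices.fetch_index,
				       __ATOMIC_ACQUIRE);

	while (idx != __atomic_load_n(&ring->indices.avail_index,
				      __ATOMIC_ACQUIRE)) {
		struct kvm_dirty_gfn *e = &ring->dirty_gfns[idx & (size - 1)];

		consume(e->slot, e->offset);
		idx++;	/* free-running: u16 wraparound is intended */
	}
	/* Publish how far we got so the kernel can reuse the slots. */
	__atomic_store_n(&ring->indices.fetch_index, idx, __ATOMIC_RELEASE);
}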

> The size of the ring should be page aligned
> +and be 16 pages at a minimum. The larger the ring, the less
> +likely the ring is full and the VM is forced to exit to
> +userspace. The optimal size is workload dependent.

---
The size of the ring must be a power of two.  The larger the ring, the
less likely the ring is full and the VM is forced to exit to userspace.
The optimal size depends on the workload, but it is recommended that it
be at least 64 KiB (4096 entries).
---
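
As a quick sanity check on those numbers: with the 16-byte
kvm_dirty_gfn below, 64 KiB of ring is exactly 4096 entries.  A small
helper (a sketch, not part of the patch) makes the power-of-two
requirement explicit:

#include <assert.h>
#include <stddef.h>

/* Number of kvm_dirty_gfn entries that fit in ring_bytes; the count
 * must be a power of two so "idx & (size - 1)" is a cheap modulo. */
static size_t ring_entries(size_t ring_bytes, size_t entry_bytes)
{
	size_t n = ring_bytes / entry_bytes;

	assert(n != 0 && (n & (n - 1)) == 0);
	return n;	/* e.g. ring_entries(64 << 10, 16) == 4096 */
}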

Paolo

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 03145b7..453c520 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1006,10 +1006,15 @@  documentation when it pops into existence).
 
 4.37 KVM_ENABLE_CAP
 
-Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
-Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM),
-	       mips (only KVM_CAP_ENABLE_CAP), ppc, s390
-Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
+Capability: KVM_CAP_ENABLE_CAP
+Architectures: mips, ppc, s390
+Type: vcpu ioctl
+Parameters: struct kvm_enable_cap (in)
+Returns: 0 on success; -1 on error
+
+Capability: KVM_CAP_ENABLE_CAP_VM
+Architectures: all
+Type: vm ioctl
 Parameters: struct kvm_enable_cap (in)
 Returns: 0 on success; -1 on error
 
@@ -3942,3 +3947,69 @@  In order to use SynIC, it has to be activated by setting this
 capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
 will disable the use of APIC hardware virtualization even if supported
 by the CPU, as it's incompatible with SynIC auto-EOI behavior.
+
+8.3 KVM_CAP_DIRTY_LOG_RING
+
+Architectures: x86
+Parameters: args[0] - size of the dirty log ring
+
+The kernel is capable of tracking dirty memory using rings, which
+are stored in memory regions that can be mmapped into userspace.
+
+There is one dirty ring per vcpu and one global ring.
+
+The dirty ring has the following structure.
+
+struct kvm_dirty_gfn {
+        __u32 pad;
+        __u32 slot; /* as_id | slot_id */
+        __u64 offset;
+};
+
+struct kvm_dirty_list {
+       union {
+               struct {
+                       __u16 avail_index; /* set by kernel */
+                       __u16 fetch_index; /* set by userspace */
+               } indices;
+               struct kvm_dirty_gfn dirty_gfns[0];
+       };
+};
+
+Userspace calls the KVM_ENABLE_CAP ioctl right after the KVM_CREATE_VM
+ioctl to enable this capability for the new guest and set the
+size of the rings. The size of the ring should be page aligned
+and be 16 pages at a minimum. The larger the ring, the less
+likely the ring is full and the VM is forced to exit to
+userspace. The optimal size is workload dependent.
+
+After the capability is enabled, userspace mmaps the global
+dirty ring. The per-vcpu dirty ring is mmapped along with kvm_run
+when the vcpu is created.
+
+To enable dirty logging with the ring, userspace calls
+KVM_SET_USER_MEMORY_REGION ioctls on all the user memory regions
+with the KVM_MEM_LOG_DIRTY_PAGES bit set.
+
+To disable dirty logging with the ring, userspace calls
+KVM_SET_USER_MEMORY_REGION ioctls on all the user memory regions
+with the KVM_MEM_LOG_DIRTY_PAGES bit cleared.
+
+Once dirty logging is enabled, userspace can start harvesting
+dirty pages.
+
+To harvest the dirty pages, userspace accesses the mmapped dirty
+list to read the dirty GFNs up to avail_index, and sets the
+fetch_index accordingly. Harvesting can be done while the guest is
+running or paused. Dirty pages don't need to be harvested all at
+once.
+
+To rearm the dirty traps, userspace calls KVM_RESET_DIRTY_PAGES
+ioctl. This should be done only when the guest is paused and
+all the dirty pages have been harvested.
+
+If one of the dirty lists is full, the guest will exit to userspace
+with the exit reason set to KVM_EXIT_DIRTY_LOG_FULL, and the
+KVM_RUN ioctl will return -EINTR. Once that happens, userspace
+should pause all the vcpus, then harvest all the dirty pages and
+rearm the dirty traps. It can unpause the guest after that.
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cd892de..0edae1b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -507,7 +507,6 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
-	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
@@ -1358,8 +1357,8 @@  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 }
 
 
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
-				   struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap)
 {
 	int r;
 
@@ -1412,15 +1411,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 
 		break;
 	}
-	case KVM_ENABLE_CAP:
-	{
-		struct kvm_enable_cap cap;
-		r = -EFAULT;
-		if (copy_from_user(&cap, argp, sizeof(cap)))
-			goto out;
-		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
-		break;
-	}
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CREATE_SPAPR_TCE_64: {
 		struct kvm_create_spapr_tce_64 create_tce_64;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6484a25..3192e52 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -366,7 +366,6 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_CSS_SUPPORT:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
-	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_S390_IRQCHIP:
 	case KVM_CAP_VM_ATTRIBUTES:
 	case KVM_CAP_MP_STATE:
@@ -480,7 +479,7 @@  static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
 	}
 }
 
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
 	int r;
 
@@ -1232,14 +1231,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_s390_inject_vm(kvm, &s390int);
 		break;
 	}
-	case KVM_ENABLE_CAP: {
-		struct kvm_enable_cap cap;
-		r = -EFAULT;
-		if (copy_from_user(&cap, argp, sizeof(cap)))
-			break;
-		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
-		break;
-	}
 	case KVM_CREATE_IRQCHIP: {
 		struct kvm_irq_routing_entry routing;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d153be8..1889f62 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2629,7 +2629,6 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_TIME:
 	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 	case KVM_CAP_TSC_DEADLINE_TIMER:
-	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_DISABLE_QUIRKS:
 	case KVM_CAP_SET_BOOT_CPU_ID:
  	case KVM_CAP_SPLIT_IRQCHIP:
@@ -3868,8 +3867,8 @@  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 	return 0;
 }
 
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
-				   struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap)
 {
 	int r;
 
@@ -4176,15 +4175,6 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-	case KVM_ENABLE_CAP: {
-		struct kvm_enable_cap cap;
-
-		r = -EFAULT;
-		if (copy_from_user(&cap, argp, sizeof(cap)))
-			goto out;
-		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
-		break;
-	}
 	default:
 		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c5190d..33d9974 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -718,6 +718,8 @@  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 			bool line_status);
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap);
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg);
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index cac48ed..117f1f9 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -871,6 +871,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
 #define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_DIRTY_LOG_RING 133
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 482612b..f2744ce 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2927,6 +2927,7 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #endif
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
+	case KVM_CAP_ENABLE_CAP_VM:
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 	case KVM_CAP_IRQ_ROUTING:
@@ -2944,6 +2945,28 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, __u32 size)
+{
+	return -EINVAL;
+}
+
+int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+						  struct kvm_enable_cap *cap)
+{
+	return -EINVAL;
+}
+
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
+					   struct kvm_enable_cap *cap)
+{
+	switch (cap->cap) {
+	case KVM_CAP_DIRTY_LOG_RING:
+		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
+	default:
+		return kvm_vm_ioctl_enable_cap(kvm, cap);
+	}
+}
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -2957,6 +2980,15 @@  static long kvm_vm_ioctl(struct file *filp,
 	case KVM_CREATE_VCPU:
 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
 		break;
+	case KVM_ENABLE_CAP: {
+		struct kvm_enable_cap cap;
+
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
+		break;
+	}
 	case KVM_SET_USER_MEMORY_REGION: {
 		struct kvm_userspace_memory_region kvm_userspace_mem;
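
For reference, the flow described in the api.txt hunk above can be
sketched from userspace roughly as follows.  KVM_CAP_DIRTY_LOG_RING
comes from this patch; KVM_EXIT_DIRTY_LOG_FULL and KVM_RESET_DIRTY_PAGES
are only named in the documentation here and are defined in later
patches of the series, so this assumes the series' headers.  Error
handling is elided.

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

#define RING_SIZE	(16 * 4096)	/* 16 pages, the documented
					   minimum (assuming 4 KiB pages) */

/* Enable the ring right after KVM_CREATE_VM, per the documentation. */
static int enable_dirty_ring(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_DIRTY_LOG_RING;
	cap.args[0] = RING_SIZE;	/* args[0]: size of the dirty ring */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* When KVM_RUN returns -EINTR with exit_reason KVM_EXIT_DIRTY_LOG_FULL:
 * pause all vcpus, harvest every ring (see harvest_ring() earlier),
 * then rearm the dirty traps before unpausing. */
static int rearm_dirty_traps(int vm_fd)
{
	return ioctl(vm_fd, KVM_RESET_DIRTY_PAGES);
}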