@@ -168,6 +168,7 @@ Based on their initialization different VMs may have different capabilities.
It is thus encouraged to use the vm ioctl to query for capabilities (available
with KVM_CAP_CHECK_EXTENSION_VM on the vm fd)
+
4.5 KVM_GET_VCPU_MMAP_SIZE
Capability: basic
@@ -180,6 +181,18 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared
memory region. This ioctl returns the size of that region. See the
KVM_RUN documentation for details.
+Besides the size of the KVM_RUN communication region, other areas of
+the VCPU file descriptor can be mmap-ed, including:
+
+- if KVM_CAP_COALESCED_MMIO is available, a page at
+ KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons,
+ this page is included in the result of KVM_GET_VCPU_MMAP_SIZE.
+ KVM_CAP_COALESCED_MMIO is not documented yet.
+
+- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at
+ KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE. For more information on
+  KVM_CAP_DIRTY_LOG_RING, see section 8.14.
+
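+As an illustration only, mapping these areas from userspace could look
+roughly like the sketch below. It assumes kvm_fd is an open /dev/kvm
+descriptor, vcpu_fd was returned by KVM_CREATE_VCPU, error handling is
+omitted, and the usual <linux/kvm.h>, <sys/ioctl.h>, <sys/mman.h> and
+<unistd.h> headers are included:
+
+	long page_size = sysconf(_SC_PAGESIZE);
+	int mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+				   MAP_SHARED, vcpu_fd, 0);
+
+	/* KVM_CHECK_EXTENSION returns the page offset of the coalesced
+	 * MMIO ring; the page itself is covered by the mapping above. */
+	int mmio_pg = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+	struct kvm_coalesced_mmio_ring *mmio_ring = NULL;
+
+	if (mmio_pg > 0)
+		mmio_ring = (void *)((char *)run + mmio_pg * page_size);
+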
4.6 KVM_SET_MEMORY_REGION
@@ -4374,3 +4387,86 @@ Parameters: none
This capability indicates if the flic device will be able to get/set the
AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
to discover this without having to create a flic device.
+
+8.14 KVM_CAP_DIRTY_LOG_RING
+
+Architectures: x86
+Parameters: args[0] - size of the dirty log ring
+
+The kernel is capable of tracking dirty memory using rings, which
+are stored in memory regions that can be mmapped into userspace.
+
+There is one dirty ring per vcpu and one global ring.
+
+The dirty ring has the following structure.
+
+struct kvm_dirty_gfn {
+ __u32 pad;
+ __u32 slot; /* as_id | slot_id */
+ __u64 offset;
+};
+
+struct kvm_dirty_ring {
+ union {
+ struct {
+ __u16 avail_index; /* set by kernel */
+ __u16 fetch_index; /* set by userspace */
+ } indices;
+ struct kvm_dirty_gfn dirty_gfns[0];
+ };
+};
+
+The two indices in the ring buffer are free running counters.
+They are _not_ limited to the range 0..size-1, where "size" is
+the number of elements in the ring buffer. This makes it easy
+to compute the number of entries in the ring buffer, which is
+simply (u16)(ring->avail_index - ring->fetch_index).
+
+In pseudocode, processing the ring buffer looks like this:
+
+	idx = ring->fetch_index;
+	while (idx != load-acquire(&ring->avail_index)) {
+ struct kvm_dirty_gfn *entry;
+ entry = &ring->dirty_gfns[idx & (size - 1)];
+ ...
+
+ idx++;
+ }
+ ring->fetch_index = idx;
+
+Userspace calls the KVM_ENABLE_CAP ioctl right after the KVM_CREATE_VM
+ioctl to enable this capability for the new guest and to set the size
+of the rings. The size of the ring must be a power of two. The larger
+the ring buffer, the less likely it is that the ring fills up and
+forces the VM to exit to userspace. The optimal size depends on the
+workload, but it is recommended that it be at least 64 KiB (4096
+entries).
+
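+For illustration, a sketch of enabling the ring (assuming vm_fd is the
+file descriptor returned by KVM_CREATE_VM, that args[0] is given in
+bytes as the 64 KiB recommendation above suggests, and with error
+handling omitted) might be:
+
+	/* a positive return value is KVM_DIRTY_LOG_PAGE_OFFSET */
+	int offset = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DIRTY_LOG_RING);
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_DIRTY_LOG_RING,
+		.args[0] = 65536,	/* 64 KiB, i.e. 4096 entries */
+	};
+
+	if (offset > 0)
+		ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+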
+After the capability is enabled, userspace mmaps the global ring
+buffer from the VM file descriptor. The per-vcpu dirty ring instead
+is mmapped when the vcpu is created, similar to the kvm_run struct.
+The per-vcpu dirty ring is located at offset KVM_DIRTY_LOG_PAGE_OFFSET *
+PAGE_SIZE of the memory mapped region.
+
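+A sketch of mapping the per-vcpu ring (vcpu_fd from KVM_CREATE_VCPU,
+page_size from sysconf(_SC_PAGESIZE), and assuming the mapping length
+equals the ring size ring_bytes configured above) could be:
+
+	struct kvm_dirty_ring *ring =
+		mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+		     vcpu_fd, (off_t)KVM_DIRTY_LOG_PAGE_OFFSET * page_size);
+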
+To start dirty logging (and hence the filling of the rings), userspace
+calls the KVM_SET_USER_MEMORY_REGION ioctl on each user memory region
+to be tracked, with the KVM_MEM_LOG_DIRTY_PAGES flag set.
+
+To stop dirty logging, userspace calls the same ioctl on those memory
+regions with the KVM_MEM_LOG_DIRTY_PAGES flag cleared.
+
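+For example, turning on dirty logging for one region might look like
+the following, where slot, gpa, size and hva are placeholders for the
+slot number, guest physical address, region size and backing host
+virtual address:
+
+	struct kvm_userspace_memory_region region = {
+		.slot = slot,
+		.flags = KVM_MEM_LOG_DIRTY_PAGES,	/* clear to stop logging */
+		.guest_phys_addr = gpa,
+		.memory_size = size,
+		.userspace_addr = (__u64)(unsigned long)hva,
+	};
+
+	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
+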
+Once dirty logging is enabled, userspace can start harvesting
+dirty pages.
+
+To harvest the dirty pages, userspace accesses the mmapped ring
+buffer to read the dirty GFNs up to avail_index and sets the
+fetch_index accordingly. Harvesting can be done while the guest is
+running or paused, and dirty pages do not need to be harvested all
+at once. To rearm the dirty traps, userspace calls the VM ioctl
+KVM_RESET_DIRTY_PAGES.
+
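+Written out as a userspace C sketch for one ring and combined with the
+reset ioctl, the pseudocode above could look as follows. Here ring is
+the mmapped struct kvm_dirty_ring, ring_bytes is the configured ring
+size, and handle_dirty_gfn() is a hypothetical helper standing in for
+whatever userspace does with a dirty page (for instance setting a bit
+in a migration bitmap):
+
+	__u32 size = ring_bytes / sizeof(struct kvm_dirty_gfn);
+	__u16 idx = ring->indices.fetch_index;
+
+	while (idx != __atomic_load_n(&ring->indices.avail_index,
+				      __ATOMIC_ACQUIRE)) {
+		struct kvm_dirty_gfn *e = &ring->dirty_gfns[idx & (size - 1)];
+
+		/* e->slot encodes as_id and slot_id, e->offset is the
+		 * page offset within that slot */
+		handle_dirty_gfn(e->slot, e->offset);
+		idx++;
+	}
+	ring->indices.fetch_index = idx;
+
+	/* rearm the dirty traps for the harvested pages */
+	ioctl(vm_fd, KVM_RESET_DIRTY_PAGES, 0);
+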
+If one of the dirty rings is full, the guest will exit to userspace
+with the exit reason set to KVM_EXIT_DIRTY_LOG_FULL, and the
+KVM_RUN ioctl will return -EINTR. Once that happens, userspace
+should pause all the vcpus, harvest all the dirty pages and
+rearm the dirty traps. It can unpause the guest after that.
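+
+As a rough sketch of how a VMM run loop might react to this exit,
+where pause_all_vcpus(), harvest_all_dirty_rings() and
+resume_all_vcpus() are hypothetical VMM helpers and run is the
+mmapped kvm_run structure:
+
+	int ret = ioctl(vcpu_fd, KVM_RUN, 0);
+
+	if (ret == -1 && errno == EINTR &&
+	    run->exit_reason == KVM_EXIT_DIRTY_LOG_FULL) {
+		pause_all_vcpus();
+		harvest_all_dirty_rings();	/* as in the loop above */
+		ioctl(vm_fd, KVM_RESET_DIRTY_PAGES, 0);
+		resume_all_vcpus();
+	}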
@@ -235,6 +235,7 @@ struct kvm_hyperv_exit {
#define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
#define KVM_EXIT_HYPERV 27
+#define KVM_EXIT_DIRTY_LOG_FULL 28
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -932,6 +933,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_HYPERV_SYNIC2 148
#define KVM_CAP_HYPERV_VP_INDEX 149
#define KVM_CAP_S390_AIS_MIGRATION 150
+#define KVM_CAP_DIRTY_LOG_RING 151
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1358,6 +1360,8 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_S390_CMMA_MIGRATION */
#define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
#define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+/* Available with KVM_CAP_DIRTY_LOG_RING */
+#define KVM_RESET_DIRTY_PAGES _IO(KVMIO, 0xba)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
@@ -1419,4 +1423,33 @@ struct kvm_assigned_msix_entry {
#define KVM_ARM_DEV_EL1_PTIMER (1 << 1)
#define KVM_ARM_DEV_PMU (1 << 2)
+/*
+ * The following are the requirements for supporting dirty log ring
+ * (by enabling KVM_DIRTY_LOG_PAGE_OFFSET).
+ *
+ * 1. Memory accesses by KVM should call kvm_vcpu_write_* instead
+ * of kvm_write_* so that the global dirty ring is not filled up
+ * too quickly.
+ * 2. kvm_arch_mmu_enable_log_dirty_pt_masked should be defined for
+ * enabling dirty logging.
+ * 3. There should not be a separate step to synchronize hardware
+ * dirty bitmap with KVM's.
+ */
+
+struct kvm_dirty_gfn {
+ __u32 pad;
+ __u32 slot;
+ __u64 offset;
+};
+
+struct kvm_dirty_ring {
+ union {
+ struct {
+ __u16 avail_index; /* set by kernel */
+ __u16 fetch_index; /* set by userspace */
+ } indices;
+ struct kvm_dirty_gfn dirty_gfns[0];
+ };
+};
+
#endif /* __LINUX_KVM_H */
@@ -2945,6 +2945,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_MULTI_ADDRESS_SPACE:
return KVM_ADDRESS_SPACE_NUM;
#endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ case KVM_CAP_DIRTY_LOG_RING:
+ return KVM_DIRTY_LOG_PAGE_OFFSET;
+#endif
case KVM_CAP_MAX_VCPU_ID:
return KVM_MAX_VCPU_ID;
default:
@@ -2953,12 +2957,37 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return kvm_vm_ioctl_check_extension(kvm, arg);
}
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+ return -EINVAL;
+}
+
+static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
+{
+ return -EINVAL;
+}
+#endif
+
int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
return -EINVAL;
}
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ switch (cap->cap) {
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ case KVM_CAP_DIRTY_LOG_RING:
+ return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
+#endif
+ default:
+ return kvm_vm_ioctl_enable_cap(kvm, cap);
+ }
+}
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2978,7 +3007,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&cap, argp, sizeof(cap)))
goto out;
- r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
break;
}
case KVM_SET_USER_MEMORY_REGION: {
@@ -3129,6 +3158,11 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_CHECK_EXTENSION:
r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
break;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ case KVM_RESET_DIRTY_PAGES:
+ r = kvm_vm_ioctl_reset_dirty_pages(kvm);
+ break;
+#endif /* KVM_DIRTY_LOG_PAGE_OFFSET */
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}