Implement ring-based dirty memory tracking.

Instead of scanning a whole dirty bitmap, userspace can mmap() a
per-vcpu ring of dirty gfns (plus one global ring for writes that
happen without a vcpu context) and harvest entries directly.  When a
ring approaches full, the vcpu receives a KVM_REQ_EXIT_DIRTY_LOG_FULL
request so userspace can collect the entries; a reset ioctl then
re-protects the collected pages and recycles the ring slots.

Signed-off-by: Lei Cao <lei.cao@stratus.com>
---
 arch/x86/kvm/Makefile    |   3 +-
 include/linux/kvm_host.h |  14 ++++
 virt/kvm/gfn_ring.c      | 100 +++++++++++++++++++++++
 virt/kvm/kvm_main.c      | 209 ++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 313 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/gfn_ring.c

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -8,7 +8,8 @@ CFLAGS_vmx.o := -I.
KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+ $(KVM)/gfn_ring.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,7 @@
#include <linux/kvm_types.h>
#include <asm/kvm_host.h>
+#include <linux/kvm_gfn_ring.h>
#ifndef KVM_MAX_VCPU_ID
#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -265,6 +266,11 @@ struct kvm_vcpu {
bool preempted;
struct kvm_vcpu_arch arch;
struct dentry *debugfs_dentry;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	u32 max_dirty_logs;	/* soft-full threshold, in ring entries */
+	struct kvm_gfn_ring dirty_ring;
+#endif
};
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -430,6 +436,12 @@ struct kvm {
struct list_head devices;
struct dentry *debugfs_dentry;
struct kvm_stat_data **debugfs_stat_data;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	u32 dirty_ring_size;	/* in bytes; 0 means the feature is off */
+	u32 max_dirty_logs;	/* soft-full threshold, in ring entries */
+	struct kvm_gfn_ring dirty_ring;	/* for writes without a vcpu context */
+#endif
};
#define kvm_err(fmt, ...) \
@@ -713,6 +725,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset,
unsigned long mask);
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
+
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log);
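Note: the patch includes <linux/kvm_gfn_ring.h> but never adds it, so the ring
types presumably come from an earlier patch in the series.  For reference while
reading the code below, here is a minimal reconstruction inferred purely from
how the fields are used in this patch; the field names are real, everything
else (ordering, padding, exact types) is a guess.

/* Hypothetical reconstruction of <linux/kvm_gfn_ring.h>; NOT part of
 * this patch.  Field names match their uses in gfn_ring.c below. */
struct kvm_dirty_gfn {
	__u32 slot;	/* (as_id << 16) | slot id */
	__u64 offset;	/* page offset into the memslot */
};

struct kvm_dirty_list {
	struct {
		__u32 avail_index;	/* written by the kernel */
		__u32 fetch_index;	/* written by userspace */
	} indices;
	struct kvm_dirty_gfn dirty_gfns[];
};

struct kvm_gfn_ring {
	u32 size;		/* ring capacity, in entries */
	u32 dirty_index;	/* next entry the kernel will fill */
	u32 reset_index;	/* next entry the kernel will recycle */
	spinlock_t lock;	/* serializes pushes without a vcpu */
	struct kvm_dirty_list *dirty_list;	/* mmap()ed by userspace */
};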
diff --git a/virt/kvm/gfn_ring.c b/virt/kvm/gfn_ring.c
new file mode 100644
--- /dev/null
+++ b/virt/kvm/gfn_ring.c
@@ -0,0 +1,100 @@
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_gfn_ring.h>
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size)
+{
+	/* The shared area is mapped into userspace, so it must be zeroed. */
+	gfnring->dirty_list = vzalloc(size);
+	if (!gfnring->dirty_list)
+		return -ENOMEM;
+
+	/* The index pair sits in front of the entries; don't count it. */
+	gfnring->size = (size - sizeof(*gfnring->dirty_list)) /
+			sizeof(struct kvm_dirty_gfn);
+ gfnring->dirty_index = 0;
+ gfnring->reset_index = 0;
+ spin_lock_init(&gfnring->lock);
+
+ return 0;
+}
+
+/*
+ * Collapse the entries userspace has consumed (up to fetch_index) into
+ * runs of up to BITS_PER_LONG pages per memslot, and re-protect each
+ * run with a single kvm_reset_dirty_gfn() call.  Returns the number of
+ * entries recycled.
+ */
+int kvm_gfn_ring_reset(struct kvm *kvm, struct kvm_gfn_ring *gfnring)
+{
+ u32 cur_slot, next_slot;
+ u64 cur_offset, next_offset;
+ unsigned long mask = 0;
+ u32 fetch;
+ int count = 0;
+ struct kvm_dirty_list *list = gfnring->dirty_list;
+
+ fetch = READ_ONCE(list->indices.fetch_index);
+ if (fetch == gfnring->reset_index)
+ return 0;
+
+ cur_slot = READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
+ cur_offset = READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
+ mask = 1;
+ count++;
+ gfnring->reset_index = (gfnring->reset_index + 1) % gfnring->size;
+ while (gfnring->reset_index != fetch) {
+ next_slot =
+ READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
+ next_offset =
+ READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
+ if ((next_slot != cur_slot) ||
+ (next_offset < cur_offset) ||
+ ((next_offset - cur_offset) > (BITS_PER_LONG - 1))) {
+ kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+ cur_slot = next_slot;
+ cur_offset = next_offset;
+ mask = 1;
+		} else {
+			mask |= (u64)1 << (next_offset - cur_offset);
+		}
+ count++;
+ gfnring->reset_index = (gfnring->reset_index + 1) %
+ gfnring->size;
+ }
+ kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+ return count;
+}
+
+/*
+ * Push one dirty gfn onto the ring.  Returns the number of entries now
+ * in use, or -EBUSY if the ring is full.  @locked asks us to take the
+ * ring lock: the per-vm ring is written from multiple contexts, while
+ * a vcpu ring is only ever written from its own vcpu thread.
+ */
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+		      u32 slot,
+		      u64 offset,
+		      bool locked)
+{
+	int num;
+	struct kvm_dirty_list *list = gfnring->dirty_list;
+
+	if (locked)
+		spin_lock(&gfnring->lock);
+
+	/* Check fullness with the lock held, so pushers cannot race past it. */
+	if (((gfnring->dirty_index + 1) % gfnring->size) ==
+	    gfnring->reset_index) {
+		if (locked)
+			spin_unlock(&gfnring->lock);
+		return -EBUSY;
+	}
+
+	list->dirty_gfns[gfnring->dirty_index].slot = slot;
+	list->dirty_gfns[gfnring->dirty_index].offset = offset;
+	/* Publish the entry before avail_index makes it visible. */
+	smp_wmb();
+	gfnring->dirty_index = (gfnring->dirty_index + 1) % gfnring->size;
+	/* Add size before subtracting so the unsigned math cannot wrap. */
+	num = (gfnring->dirty_index + gfnring->size -
+	       gfnring->reset_index) % gfnring->size;
+	list->indices.avail_index = gfnring->dirty_index;
+
+	if (locked)
+		spin_unlock(&gfnring->lock);
+
+	return num;
+}
+
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
+{
+	return vmalloc_to_page((void *)ring->dirty_list + i * PAGE_SIZE);
+}
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *gfnring)
+{
+	vfree(gfnring->dirty_list);	/* vfree(NULL) is a no-op */
+	gfnring->dirty_list = NULL;	/* guard against a double free */
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -63,6 +63,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
+#include <linux/kvm_gfn_ring.h>
+
/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12
@@ -121,7 +123,16 @@ static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *memslot,
+ gfn_t gfn);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn);
+#endif
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -258,11 +269,34 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
kvm_vcpu_set_dy_eligible(vcpu, false);
vcpu->preempted = false;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size) {
+		r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
+				       kvm->dirty_ring_size);
+		if (r)
+			goto fail_free_run;
+		/*
+		 * Reserve headroom for entries the arch may still push
+		 * (e.g. a PML flush) after the ring-full exit is requested.
+		 */
+		vcpu->max_dirty_logs = vcpu->dirty_ring.size - 1 -
+				       kvm_cpu_dirty_log_size();
+	}
+#endif
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
-		goto fail_free_run;
+		goto fail_free_ring;
 	return 0;
+
+fail_free_ring:
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
fail_free_run:
free_page((unsigned long)vcpu->run);
fail:
@@ -275,6 +309,10 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
put_pid(vcpu->pid);
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ if (vcpu->kvm->dirty_ring_size)
+ kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
@@ -726,6 +764,10 @@ static void kvm_destroy_vm(struct kvm *kvm)
for (i = 0; i < KVM_NR_BUSES; i++)
kvm_io_bus_destroy(kvm->buses[i]);
kvm_coalesced_mmio_free(kvm);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ if (kvm->dirty_ring_size)
+ kvm_gfn_ring_free(&kvm->dirty_ring);
+#endif
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
@@ -1861,7 +1903,8 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
-static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
+static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *memslot, gfn_t gfn,
const void *data, int offset, int len)
{
int r;
@@ -1873,7 +1916,7 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
r = __copy_to_user((void __user *)addr + offset, data, len);
if (r)
return -EFAULT;
- mark_page_dirty_in_slot(memslot, gfn);
+ mark_page_dirty_in_slot(kvm, vcpu, memslot, gfn);
return 0;
}
@@ -1882,7 +1925,8 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
{
struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
- return __kvm_write_guest_page(slot, gfn, data, offset, len);
+ return __kvm_write_guest_page(kvm, NULL, slot, gfn, data,
+ offset, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);
@@ -1891,7 +1935,8 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return __kvm_write_guest_page(slot, gfn, data, offset, len);
+ return __kvm_write_guest_page(vcpu->kvm, vcpu, slot, gfn, data,
+ offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
@@ -1995,7 +2040,7 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cac
r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
if (r)
return -EFAULT;
- mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
+ mark_page_dirty_in_slot(v->kvm, v, ghc->memslot, gpa >> PAGE_SHIFT);
return 0;
}
@@ -2060,12 +2105,17 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *memslot,
gfn_t gfn)
{
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ mark_page_dirty_in_ring(kvm, vcpu, memslot, gfn);
+#endif
set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -2075,7 +2125,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
struct kvm_memory_slot *memslot;
memslot = gfn_to_memslot(kvm, gfn);
- mark_page_dirty_in_slot(memslot, gfn);
+ mark_page_dirty_in_slot(kvm, NULL, memslot, gfn);
}
EXPORT_SYMBOL_GPL(mark_page_dirty);
@@ -2084,7 +2134,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
struct kvm_memory_slot *memslot;
memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- mark_page_dirty_in_slot(memslot, gfn);
+ mark_page_dirty_in_slot(vcpu->kvm, vcpu, memslot, gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
@@ -2363,6 +2413,11 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	/* Bound the offset so a stray pgoff cannot map past the ring. */
+	else if (vmf->pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET &&
+		 vmf->pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
+			      vcpu->kvm->dirty_ring_size / PAGE_SIZE)
+		page = kvm_gfn_ring_get_page(&vcpu->dirty_ring,
+				vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
+#endif
else
return kvm_arch_vcpu_fault(vcpu, vmf);
get_page(page);
@@ -2946,14 +3001,118 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
}
#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
-static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, __u32 size)
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn)
{
- return -EINVAL;
+ struct kvm_gfn_ring *gfnlist;
+ u32 as_id = 0;
+ u64 offset;
+ struct kvm_vcpu *exit_vcpu = vcpu;
+ int num;
+ bool locked;
+ u32 max;
+
+ if (!kvm->dirty_ring_size)
+ return;
+
+	offset = gfn - slot->base_gfn;
+
+	/*
+	 * The dirty bitmap doubles as a de-duplication filter: a gfn
+	 * whose bit is already set is already queued on some ring.
+	 */
+	if (test_bit_le(offset, slot->dirty_bitmap))
+		return;
+
+	if (vcpu) {
+		as_id = kvm_arch_vcpu_memslots_id(vcpu);
+		gfnlist = &vcpu->dirty_ring;
+		max = vcpu->max_dirty_logs;
+	} else {
+		gfnlist = &kvm->dirty_ring;
+		max = kvm->max_dirty_logs;
+	}
+
+	/* Only the per-vm ring can be written from several contexts. */
+	locked = (vcpu == NULL);
+
+	num = kvm_gfn_ring_push(gfnlist, (as_id << 16) | slot->id,
+				offset, locked);
+ if (num < 0) {
+ if (vcpu)
+ WARN_ONCE(1, "vcpu %d dirty log overflow\n",
+ vcpu->vcpu_id);
+ else
+ WARN_ONCE(1, "global dirty log overflow\n");
+ return;
+ }
+
+ if (num == max) {
+ if (!exit_vcpu)
+ exit_vcpu = kvm->vcpus[0];
+ kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, exit_vcpu);
+ }
+}
+
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+ struct kvm_memory_slot *memslot;
+ int as_id, id;
+
+ as_id = slot >> 16;
+ id = (u16)slot;
+ if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ return;
+
+	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+	/* The slot may have been deleted or had logging disabled meanwhile. */
+	if (!memslot->dirty_bitmap || offset >= memslot->npages)
+		return;
+
+ spin_lock(&kvm->mmu_lock);
+ kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+ spin_unlock(&kvm->mmu_lock);
+
+ while (mask) {
+ clear_bit_le(offset + __ffs(mask), memslot->dirty_bitmap);
+ mask &= mask - 1;
+ }
+}
+
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+	int r;
+
+	/*
+	 * The ring is exposed to userspace via mmap(), so require whole
+	 * pages, and refuse double initialization.
+	 */
+	if (!size || (size & (PAGE_SIZE - 1)) || kvm->dirty_ring_size)
+		return -EINVAL;
+
+	r = kvm_gfn_ring_alloc(&kvm->dirty_ring, size);
+	if (r)
+		return r;
+
+	/* Publish the size only once the ring actually exists. */
+	kvm->dirty_ring_size = size;
+	kvm->max_dirty_logs = kvm->dirty_ring.size - 1;
+	return 0;
}
static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
{
- return -EINVAL;
+ int i;
+ struct kvm_vcpu *vcpu;
+ int cleared = 0;
+
+ if (!kvm->dirty_ring_size)
+ return -EINVAL;
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ cleared += kvm_gfn_ring_reset(kvm, &vcpu->dirty_ring);
+
+ cleared += kvm_gfn_ring_reset(kvm, &kvm->dirty_ring);
+
+ mutex_unlock(&kvm->slots_lock);
+
+ if (cleared)
+ kvm_flush_remote_tlbs(kvm);
+
+ return cleared;
}
#endif
@@ -3202,6 +3361,29 @@ static long kvm_vm_compat_ioctl(struct file *filp,
}
#endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvm *kvm = vma->vm_file->private_data;
+	struct page *page;
+
+	/* Fault in ring pages only; anything else gets SIGBUS. */
+	if (!kvm->dirty_ring_size ||
+	    vmf->pgoff >= kvm->dirty_ring_size / PAGE_SIZE)
+		return VM_FAULT_SIGBUS;
+
+	page = kvm_gfn_ring_get_page(&kvm->dirty_ring, vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_vm_vm_ops = {
+ .fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &kvm_vm_vm_ops;
+ return 0;
+}
+#endif
+
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
@@ -3209,6 +3391,9 @@ static struct file_operations kvm_vm_fops = {
.compat_ioctl = kvm_vm_compat_ioctl,
#endif
.llseek = noop_llseek,
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ .mmap = kvm_vm_mmap,
+#endif
};
static int kvm_dev_ioctl_create_vm(unsigned long type)
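For orientation, here is a sketch of the userspace side this enables.  It is
a sketch only: the struct layout is the reconstruction shown earlier, the
KVM_RESET_DIRTY_PAGES name and ioctl number below are stand-ins that this
patch does not define, and error handling is omitted.

/* Hypothetical VMM-side consumer of the dirty ring. */
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

#define KVM_RESET_DIRTY_PAGES	_IO(0xAE, 0xc2)	/* ASSUMED name and number */

struct kvm_dirty_gfn { uint32_t slot; uint64_t offset; };
struct kvm_dirty_list {
	struct { uint32_t avail_index; uint32_t fetch_index; } indices;
	struct kvm_dirty_gfn dirty_gfns[];
};

/* Map one vcpu's ring; the offset mirrors kvm_vcpu_fault() above. */
static struct kvm_dirty_list *map_vcpu_ring(int vcpu_fd, uint32_t ring_bytes,
					    long dirty_log_page_offset)
{
	return mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vcpu_fd, dirty_log_page_offset * sysconf(_SC_PAGESIZE));
}

/*
 * Harvest entries the kernel has published, then ask it to re-protect
 * the pages.  Typically run on a ring-full exit or periodically during
 * live migration.
 */
static void harvest_ring(int vm_fd, struct kvm_dirty_list *list,
			 uint32_t nr_entries)
{
	/* Acquire pairs with the smp_wmb() in kvm_gfn_ring_push(). */
	uint32_t avail = __atomic_load_n(&list->indices.avail_index,
					 __ATOMIC_ACQUIRE);
	uint32_t fetch = list->indices.fetch_index;

	while (fetch != avail) {
		struct kvm_dirty_gfn *d = &list->dirty_gfns[fetch];
		(void)d;	/* a real VMM would queue (d->slot, d->offset) */
		fetch = (fetch + 1) % nr_entries;
	}
	list->indices.fetch_index = fetch;	/* tell the kernel how far we got */
	ioctl(vm_fd, KVM_RESET_DIRTY_PAGES, 0);
}

The per-vm ring for writes without a vcpu context is mapped the same way,
via mmap() on the VM fd starting at offset 0, per kvm_vm_mmap() above.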