@@ -9,7 +9,8 @@ CFLAGS_vmx.o := -I.
KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+ $(KVM)/gfn_ring.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
new file mode 100644
@@ -0,0 +1,68 @@
+#ifndef KVM_GFN_RING_H
+#define KVM_GFN_RING_H
+
+/*
+ * struct kvm_dirty_ring is defined in include/uapi/linux/kvm.h.
+ *
+ * dirty_ring: shared with userspace via mmap. dirty_ring->dirty_gfns
+ * is the compact list that holds the dirty pages.
+ * dirty_index: free-running counter that points to the next slot in
+ * dirty_ring->dirty_gfns where a new dirty page should go.
+ * reset_index: free-running counter that points to the next dirty page
+ * in dirty_ring->dirty_gfns for which the dirty trap needs to
+ * be re-enabled.
+ * size: size of the compact list, dirty_ring->dirty_gfns.
+ * soft_limit: when the number of dirty pages in the list reaches this
+ * limit, the vcpu that owns this ring should exit to userspace
+ * so that userspace can harvest all the dirty pages.
+ * lock: protects dirty_ring; only used when this is the global
+ * ring.
+ *
+ * The number of dirty pages in the ring is given by
+ * dirty_index - reset_index.
+ *
+ * The kernel increments dirty_ring->indices.avail_index after
+ * dirty_index is incremented. When userspace harvests the dirty pages,
+ * it increments dirty_ring->indices.fetch_index up to
+ * dirty_ring->indices.avail_index. When the kernel re-enables dirty
+ * traps for the dirty pages, it increments reset_index up to
+ * dirty_ring->indices.fetch_index.
+ *
+ */
+struct kvm_gfn_ring {
+ u16 dirty_index;
+ u16 reset_index;
+ u32 size;
+ u32 soft_limit;
+ spinlock_t lock;
+ struct kvm_dirty_ring *dirty_ring;
+};
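+
+/*
+ * Illustrative userspace harvesting flow (not part of this patch;
+ * "collect_dirty_page()" is a made-up consumer), assuming the ring
+ * has been mmap()ed at KVM_DIRTY_LOG_PAGE_OFFSET:
+ *
+ *   fetch = ring->indices.fetch_index;
+ *   while (fetch != ring->indices.avail_index) {
+ *           entry = &ring->dirty_gfns[fetch & (size - 1)];
+ *           collect_dirty_page(entry->slot, entry->offset);
+ *           fetch++;
+ *   }
+ *   ring->indices.fetch_index = fetch;
+ *
+ * Once fetch_index has been advanced, the reset path (see
+ * kvm_gfn_ring_reset() below) re-enables the dirty traps up to
+ * fetch_index.
+ */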
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring,
+ u32 size,
+ u32 limit);
+
+/*
+ * Called with kvm->slots_lock held. Returns the number of
+ * processed pages.
+ */
+int kvm_gfn_ring_reset(struct kvm *kvm,
+ struct kvm_gfn_ring *gfnring);
+
+/*
+ * Returns:
+ *  0      - successfully pushed
+ *  1      - successfully pushed and the soft limit was reached;
+ *           the vcpu should exit to userspace
+ *  -EBUSY - unable to push, dirty ring full.
+ */
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+ u32 slot,
+ u64 offset,
+ bool locked);
+
+/* for use in vm_operations_struct */
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring,
+ u32 i);
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *ring);
+
+#endif
@@ -35,6 +35,7 @@
#include <linux/kvm_types.h>
#include <asm/kvm_host.h>
+#include <linux/kvm_gfn_ring.h>
#ifndef KVM_MAX_VCPU_ID
#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -126,6 +127,7 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_PENDING_TIMER 2
#define KVM_REQ_UNHALT 3
+#define KVM_REQ_EXIT_DIRTY_LOG_FULL 4
#define KVM_REQUEST_ARCH_BASE 8
#define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -274,6 +276,10 @@ struct kvm_vcpu {
bool preempted;
struct kvm_vcpu_arch arch;
struct dentry *debugfs_dentry;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ struct kvm_gfn_ring dirty_ring;
+#endif
};
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -441,6 +447,10 @@ struct kvm {
#endif
long tlbs_dirty;
struct list_head devices;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ u32 dirty_ring_size;
+ struct kvm_gfn_ring dirty_ring;
+#endif
struct dentry *debugfs_dentry;
struct kvm_stat_data **debugfs_stat_data;
struct srcu_struct srcu;
@@ -751,6 +761,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset,
unsigned long mask);
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
+
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log);
new file mode 100644
@@ -0,0 +1,135 @@
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_gfn_ring.h>
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size, u32 limit)
+{
+ gfnring->dirty_ring = vzalloc(size);
+ if (!gfnring->dirty_ring)
+ return -ENOMEM;
+
+ gfnring->size = size / sizeof(struct kvm_dirty_gfn);
+ gfnring->soft_limit = limit;
+ gfnring->dirty_index = 0;
+ gfnring->reset_index = 0;
+ spin_lock_init(&gfnring->lock);
+
+ return 0;
+}
+
+int kvm_gfn_ring_reset(struct kvm *kvm, struct kvm_gfn_ring *gfnring)
+{
+ u32 cur_slot, next_slot;
+ u64 cur_offset, next_offset;
+ unsigned long mask;
+ u32 fetch;
+ int count = 0;
+ struct kvm_dirty_gfn *entry;
+ struct kvm_dirty_ring *ring = gfnring->dirty_ring;
+
+ fetch = READ_ONCE(ring->indices.fetch_index);
+ if (fetch == gfnring->reset_index)
+ return 0;
+
+ entry = &ring->dirty_gfns[gfnring->reset_index &
+ (gfnring->size - 1)];
+ /*
+ * The ring buffer is shared with userspace, which might mmap
+ * it and concurrently modify slot and offset. Userspace must
+ * not be trusted! READ_ONCE prevents the compiler from changing
+ * the values after they've been range-checked (the checks are
+ * in kvm_reset_dirty_gfn).
+ */
+ smp_read_barrier_depends();
+ cur_slot = READ_ONCE(entry->slot);
+ cur_offset = READ_ONCE(entry->offset);
+ mask = 1;
+ count++;
+ gfnring->reset_index++;
+ while (gfnring->reset_index != fetch) {
+ entry = &ring->dirty_gfns[gfnring->reset_index &
+ (gfnring->size - 1)];
+ smp_read_barrier_depends();
+ next_slot = READ_ONCE(entry->slot);
+ next_offset = READ_ONCE(entry->offset);
+ gfnring->reset_index++;
+ count++;
+ /*
+ * Try to coalesce the reset operations when the guest is
+ * scanning pages in the same slot.
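+ * For example, offsets 16, 17, 19, 18 of the same slot coalesce
+ * into cur_offset = 16 with mask = 0xf, so a single
+ * kvm_reset_dirty_gfn() call covers all four pages.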
+ */
+ if (next_slot == cur_slot) {
+ int delta = next_offset - cur_offset;
+
+ if (delta >= 0 && delta < BITS_PER_LONG) {
+ mask |= 1ull << delta;
+ continue;
+ }
+
+ /* Backwards visit, careful about overflows! */
+ if (delta > -BITS_PER_LONG && delta < 0 &&
+ (mask << -delta >> -delta) == mask) {
+ cur_offset = next_offset;
+ mask = (mask << -delta) | 1;
+ continue;
+ }
+ }
+ kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+ cur_slot = next_slot;
+ cur_offset = next_offset;
+ mask = 1;
+ }
+ kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+ return count;
+}
+
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+ u32 slot,
+ u64 offset,
+ bool locked)
+{
+ int ret;
+ u16 num;
+ struct kvm_dirty_gfn *entry;
+
+ if (locked)
+ spin_lock(&gfnring->lock);
+
+ num = (u16)(gfnring->dirty_index - gfnring->reset_index);
+ if (num >= gfnring->size) {
+ WARN_ON_ONCE(num > gfnring->size);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ entry = &gfnring->dirty_ring->dirty_gfns[gfnring->dirty_index &
+ (gfnring->size - 1)];
+ entry->slot = slot;
+ entry->offset = offset;
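+ /*
+ * Ensure the new entry is visible to userspace before
+ * dirty_index and avail_index are advanced.
+ */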
+ smp_wmb();
+ gfnring->dirty_index++;
+ num = gfnring->dirty_index - gfnring->reset_index;
+ gfnring->dirty_ring->indices.avail_index = gfnring->dirty_index;
+ ret = num >= gfnring->soft_limit;
+
+out:
+ if (locked)
+ spin_unlock(&gfnring->lock);
+
+ return ret;
+}
+
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
+{
+ return vmalloc_to_page((void *)ring->dirty_ring + i * PAGE_SIZE);
+}
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *gfnring)
+{
+ if (gfnring->dirty_ring)
+ vfree(gfnring->dirty_ring);
+}
@@ -65,9 +65,16 @@
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
+#include <linux/kvm_gfn_ring.h>
+
/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+/* extra buffer entries reserved in the dirty ring for ring-full situations */
+#define DIRTY_RING_BUFFER_ENTRY_NUM 16
+#endif
+
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -127,6 +134,12 @@ static void mark_page_dirty_in_slot(struct kvm *kvm,
struct kvm_vcpu *vcpu,
struct kvm_memory_slot *memslot,
gfn_t gfn);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn);
+#endif
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -296,11 +309,36 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
kvm_vcpu_set_dy_eligible(vcpu, false);
vcpu->preempted = false;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ if (kvm->dirty_ring_size) {
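+ /*
+ * Keep some slack below the ring capacity: after the soft
+ * limit triggers an exit, the vcpu can still push the extra
+ * DIRTY_RING_BUFFER_ENTRY_NUM entries plus whatever
+ * kvm_cpu_dirty_log_size() accounts for on the arch side
+ * (e.g. a hardware dirty log flushed at exit time). E.g. with
+ * a 64KB ring and 16-byte entries (sizes illustrative only),
+ * the soft limit is 4096 - 16 - kvm_cpu_dirty_log_size().
+ */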
+ u32 limit = (kvm->dirty_ring_size /
+ sizeof(struct kvm_dirty_gfn)) -
+ DIRTY_RING_BUFFER_ENTRY_NUM -
+ kvm_cpu_dirty_log_size();
+ r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
+ kvm->dirty_ring_size,
+ limit);
+ if (r) {
+ kvm->dirty_ring_size = 0;
+ goto fail_free_run;
+ }
+ }
+#endif
+
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ goto fail_free_ring;
+#else
goto fail_free_run;
+#endif
return 0;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+fail_free_ring:
+ if (kvm->dirty_ring_size)
+ kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
fail_free_run:
free_page((unsigned long)vcpu->run);
fail:
@@ -318,6 +356,10 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
put_pid(rcu_dereference_protected(vcpu->pid, 1));
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ if (vcpu->kvm->dirty_ring_size)
+ kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
@@ -727,6 +769,10 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm->buses[i] = NULL;
}
kvm_coalesced_mmio_free(kvm);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ if (kvm->dirty_ring_size)
+ kvm_gfn_ring_free(&kvm->dirty_ring);
+#endif
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
@@ -2057,6 +2103,9 @@ static void mark_page_dirty_in_slot(struct kvm *kvm,
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ mark_page_dirty_in_ring(kvm, vcpu, memslot, gfn);
+#endif
set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -2383,6 +2432,13 @@ static int kvm_vcpu_fault(struct vm_fault *vmf)
else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ else if ((vmf->pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
+ (vmf->pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
+ vcpu->kvm->dirty_ring_size / PAGE_SIZE))
+ page = kvm_gfn_ring_get_page(&vcpu->dirty_ring,
+ vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
+#endif
else
return kvm_arch_vcpu_fault(vcpu, vmf);
get_page(page);
@@ -2966,14 +3022,128 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
}
#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ struct kvm_gfn_ring *gfnlist;
+ u32 as_id = 0;
+ u64 offset;
+ struct kvm_vcpu *exit_vcpu;
+ struct kvm_vcpu *ring_vcpu;
+ int ret;
+ bool locked = false;
+
+ if (!kvm->dirty_ring_size)
+ return;
+
+ offset = gfn - slot->base_gfn;
+
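+ /*
+ * If the page is already dirty in the bitmap, it has been pushed
+ * to a ring earlier and its dirty trap has not been reset yet,
+ * so there is no need to push it again.
+ */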
+ if (test_bit_le(offset, slot->dirty_bitmap))
+ return;
+
+ if (vcpu) {
+ as_id = kvm_arch_vcpu_memslots_id(vcpu);
+ ring_vcpu = vcpu;
+ } else {
+ as_id = 0;
+ ring_vcpu = kvm_get_running_vcpu();
+ }
+
+ if (ring_vcpu) {
+ gfnlist = &ring_vcpu->dirty_ring;
+ exit_vcpu = ring_vcpu;
+ } else {
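+ /*
+ * No vcpu context (e.g. the page was dirtied from a VM-wide
+ * ioctl): use the global ring, which needs the spinlock since
+ * multiple writers may race, and kick vcpu0 out to userspace
+ * when the ring gets full.
+ */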
+ gfnlist = &kvm->dirty_ring;
+ exit_vcpu = kvm->vcpus[0];
+ locked = true;
+ }
+
+ ret = kvm_gfn_ring_push(gfnlist, (as_id << 16)|slot->id,
+ offset, locked);
+ if (ret < 0) {
+ if (vcpu)
+ WARN_ONCE(1, "vcpu %d dirty log overflow\n",
+ vcpu->vcpu_id);
+ else
+ WARN_ONCE(1, "global dirty log overflow\n");
+ return;
+ }
+
+ if (ret)
+ kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, exit_vcpu);
+}
+
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+ struct kvm_memory_slot *memslot;
+ int as_id, id;
+
+ as_id = slot >> 16;
+ id = (u16)slot;
+ if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ return;
+
+ memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+ if (offset >= memslot->npages)
+ return;
+
+ spin_lock(&kvm->mmu_lock);
+
+ /* Re-enable the dirty traps before the loop below consumes mask. */
+ kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+
+ /* FIXME: we should use a single AND operation, but there is no
+ * applicable atomic API.
+ */
+ while (mask) {
+ clear_bit_le(offset + __ffs(mask), memslot->dirty_bitmap);
+ mask &= mask - 1;
+ }
+ spin_unlock(&kvm->mmu_lock);
+}
+
static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
{
- return -EINVAL;
+ int r;
+ u32 limit;
+
+ /* the size should be a power of 2 */
+ if (!size || (size & (size - 1)))
+ return -EINVAL;
+
+ limit = (size / sizeof(struct kvm_dirty_gfn)) -
+ DIRTY_RING_BUFFER_ENTRY_NUM;
+ r = kvm_gfn_ring_alloc(&kvm->dirty_ring, size, limit);
+ if (r)
+ return r;
+
+ kvm->dirty_ring_size = size;
+ return 0;
}
static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
{
- return -EINVAL;
+ int i;
+ struct kvm_vcpu *vcpu;
+ int cleared = 0;
+
+ if (!kvm->dirty_ring_size)
+ return -EINVAL;
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ cleared += kvm_gfn_ring_reset(kvm, &vcpu->dirty_ring);
+
+ cleared += kvm_gfn_ring_reset(kvm, &kvm->dirty_ring);
+
+ mutex_unlock(&kvm->slots_lock);
+
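+ /*
+ * Re-enabling the dirty traps may have write-protected sptes or
+ * cleared their dirty bits; flush remote TLBs so the guest takes
+ * a fault on the next write to those pages.
+ */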
+ if (cleared)
+ kvm_flush_remote_tlbs(kvm);
+
+ return cleared;
}
#endif
@@ -3219,6 +3389,29 @@ static long kvm_vm_compat_ioctl(struct file *filp,
}
#endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_fault(struct vm_fault *vmf)
+{
+ struct kvm *kvm = vmf->vma->vm_file->private_data;
+ struct page *page;
+
+ if (!kvm->dirty_ring_size ||
+ vmf->pgoff >= kvm->dirty_ring_size / PAGE_SIZE)
+ return VM_FAULT_SIGBUS;
+
+ page = kvm_gfn_ring_get_page(&kvm->dirty_ring, vmf->pgoff);
+ get_page(page);
+ vmf->page = page;
+ return 0;
+}
+
+static const struct vm_operations_struct kvm_vm_vm_ops = {
+ .fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &kvm_vm_vm_ops;
+ return 0;
+}
+#endif
+
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
@@ -3226,6 +3419,9 @@ static long kvm_vm_compat_ioctl(struct file *filp,
.compat_ioctl = kvm_vm_compat_ioctl,
#endif
.llseek = noop_llseek,
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+ .mmap = kvm_vm_mmap,
+#endif
};
static int kvm_dev_ioctl_create_vm(unsigned long type)