
[v3,5/6] KVM: Implement ring-based dirty memory tracking

Message ID BYAPR08MB397312D2071AA41384E8A99AF0710@BYAPR08MB3973.namprd08.prod.outlook.com (mailing list archive)
State New, archived

Commit Message

Cao, Lei June 18, 2018, 1:22 p.m. UTC
KVM currently uses large bitmaps to track dirty memory.  These bitmaps
are copied to userspace when userspace queries KVM for its dirty page
information.  The use of bitmaps is sufficient for the live-migration
use case, as large parts of memory are dirtied from one log-dirty
pass to another.  In a checkpointing system, however, the number of
dirty pages is small and in fact often bounded: the VM is paused
when it has dirtied a pre-defined number of pages.  Traversing a large,
sparsely populated bitmap to find the set bits is time-consuming, as is
copying the bitmap to userspace.

The preferred data structure for performant checkpointing solutions is
a dense list of guest frame numbers (GFNs).  This patch series stores
the dirty list in kernel memory that can be memory-mapped into
userspace to allow speedy harvesting.
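
For illustration only (not part of this patch): below is a rough sketch of
how a userspace checkpointing loop might mmap and drain one of these rings.
It relies on the kvm_dirty_ring/kvm_dirty_gfn layout and the
KVM_DIRTY_LOG_PAGE_OFFSET mmap offset used by this series; the helper names,
the ring-size argument and the omitted error handling are illustrative only,
and the ioctls that enable the ring and re-arm the dirty traps are added by
other patches in the series.

#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

/* Map one vcpu's dirty ring; ring_bytes is the power-of-two ring size. */
static struct kvm_dirty_ring *map_dirty_ring(int vcpu_fd, size_t ring_bytes)
{
	return mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vcpu_fd, KVM_DIRTY_LOG_PAGE_OFFSET * getpagesize());
}

/* Drain all entries the kernel has pushed so far; entries is a power of 2. */
static void harvest_dirty_ring(struct kvm_dirty_ring *ring, uint32_t entries)
{
	uint32_t fetch = ring->indices.fetch_index;
	/* the kernel advances avail_index after each pushed entry */
	uint32_t avail = __atomic_load_n(&ring->indices.avail_index,
					 __ATOMIC_ACQUIRE);

	while (fetch != avail) {
		struct kvm_dirty_gfn *e =
			&ring->dirty_gfns[fetch & (entries - 1)];

		/*
		 * e->slot is (as_id << 16) | memslot id and e->offset is
		 * the page offset within that memslot; a real harvester
		 * copies or records that page for the checkpoint here.
		 */
		(void)e;
		fetch++;
	}
	/* publish progress; the kernel later resets traps up to fetch_index */
	__atomic_store_n(&ring->indices.fetch_index, fetch, __ATOMIC_RELEASE);
}

Once every ring has been drained this way, userspace issues the reset ioctl
(introduced elsewhere in the series) so that kvm_gfn_ring_reset() can
re-enable the dirty traps for the harvested pages.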

Signed-off-by: Lei Cao <lei.cao@stratus.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/Makefile        |   3 +-
 include/linux/kvm_gfn_ring.h |  68 +++++++++++++++
 include/linux/kvm_host.h     |  12 +++
 virt/kvm/gfn_ring.c          | 135 +++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c          | 200 ++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 415 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/kvm_gfn_ring.h
 create mode 100644 virt/kvm/gfn_ring.c

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index dc4f2fd..19fdd31 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,7 +9,8 @@  CFLAGS_vmx.o := -I.
 KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+				$(KVM)/gfn_ring.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm_gfn_ring.h b/include/linux/kvm_gfn_ring.h
new file mode 100644
index 0000000..9d5ca99
--- /dev/null
+++ b/include/linux/kvm_gfn_ring.h
@@ -0,0 +1,68 @@ 
+#ifndef KVM_GFN_RING_H
+#define KVM_GFN_RING_H
+
+/*
+ * struct kvm_dirty_ring is defined in include/uapi/linux/kvm.h.
+ *
+ * dirty_ring:  shared with userspace via mmap. dirty_ring->dirty_gfns
+ *              is the compact list that holds the dirty pages.
+ * dirty_index: free running counter that points to the next slot in
+ *              dirty_ring->dirty_gfns  where a new dirty page should go.
+ * reset_index: free running counter that points to the next dirty page
+ *              in dirty_ring->dirty_gfns for which the dirty trap needs
+ *              to be re-enabled
+ * size:        size of the compact list, dirty_ring->dirty_gfns
+ * soft_limit:  when the number of dirty pages in the list reaches this
+ *              limit, the vcpu that owns this ring should exit to
+ *              userspace so that userspace can harvest all the dirty pages
+ * lock:        protects dirty_ring, only in use if this is the global
+ *              ring
+ *
+ * The number of dirty pages in the ring is calculated as
+ * dirty_index - reset_index.
+ *
+ * The kernel increments dirty_ring->indices.avail_index after dirty_index
+ * is incremented.  When userspace harvests the dirty pages, it increments
+ * dirty_ring->indices.fetch_index up to dirty_ring->indices.avail_index.
+ * When the kernel re-enables dirty traps for the harvested pages, it
+ * increments reset_index up to dirty_ring->indices.fetch_index.
+ *
+ */
+struct kvm_gfn_ring {
+	u16 dirty_index;
+	u16 reset_index;
+	u32 size;
+	u32 soft_limit;
+	spinlock_t lock;
+	struct kvm_dirty_ring *dirty_ring;
+};
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring,
+		       u32 size,
+		       u32 limit);
+
+/*
+ * Called with kvm->slots_lock held; returns the number of
+ * processed pages.
+ */
+int kvm_gfn_ring_reset(struct kvm *kvm,
+		       struct kvm_gfn_ring *gfnring);
+
+/*
+ * returns 0: successfully pushed
+ *         1: successfully pushed, soft limit reached,
+ *            vcpu should exit to userspace
+ *         -EBUSY: unable to push, dirty ring full.
+ */
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+		      u32 slot,
+		      u64 offset,
+		      bool locked);
+
+/* for use in vm_operations_struct */
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring,
+				   u32 i);
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *ring);
+
+#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11e891a..feee06c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,7 @@ 
 #include <linux/kvm_types.h>
 
 #include <asm/kvm_host.h>
+#include <linux/kvm_gfn_ring.h>
 
 #ifndef KVM_MAX_VCPU_ID
 #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -126,6 +127,7 @@  static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MMU_RELOAD        (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_PENDING_TIMER     2
 #define KVM_REQ_UNHALT            3
+#define KVM_REQ_EXIT_DIRTY_LOG_FULL 4
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -274,6 +276,10 @@  struct kvm_vcpu {
 	bool preempted;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	struct kvm_gfn_ring dirty_ring;
+#endif
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -441,6 +447,10 @@  struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	u32 dirty_ring_size;
+	struct kvm_gfn_ring dirty_ring;
+#endif
 	struct dentry *debugfs_dentry;
 	struct kvm_stat_data **debugfs_stat_data;
 	struct srcu_struct srcu;
@@ -751,6 +761,8 @@  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					gfn_t gfn_offset,
 					unsigned long mask);
 
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
+
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
 
diff --git a/virt/kvm/gfn_ring.c b/virt/kvm/gfn_ring.c
new file mode 100644
index 0000000..cb0f455
--- /dev/null
+++ b/virt/kvm/gfn_ring.c
@@ -0,0 +1,135 @@ 
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_gfn_ring.h>
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size, u32 limit)
+{
+	gfnring->dirty_ring = vmalloc(size);
+	if (!gfnring->dirty_ring)
+		return -ENOMEM;
+	memset(gfnring->dirty_ring, 0, size);
+
+	gfnring->size = size/sizeof(struct kvm_dirty_gfn);
+	gfnring->soft_limit = limit;
+	gfnring->dirty_index = 0;
+	gfnring->reset_index = 0;
+	spin_lock_init(&gfnring->lock);
+
+	return 0;
+}
+
+int kvm_gfn_ring_reset(struct kvm *kvm, struct kvm_gfn_ring *gfnring)
+{
+	u32 cur_slot, next_slot;
+	u64 cur_offset, next_offset;
+	unsigned long mask;
+	u32 fetch;
+	int count = 0;
+	struct kvm_dirty_gfn *entry;
+	struct kvm_dirty_ring *ring = gfnring->dirty_ring;
+
+	fetch = READ_ONCE(ring->indices.fetch_index);
+	if (fetch == gfnring->reset_index)
+		return 0;
+
+	entry = &ring->dirty_gfns[gfnring->reset_index &
+			(gfnring->size - 1)];
+	/*
+	 * The ring buffer is shared with userspace, which might mmap
+	 * it and concurrently modify slot and offset.  Userspace must
+	 * not be trusted!  READ_ONCE prevents the compiler from changing
+	 * the values after they've been range-checked (the checks are
+	 * in kvm_reset_dirty_gfn).
+	 */
+	smp_read_barrier_depends();
+	cur_slot = READ_ONCE(entry->slot);
+	cur_offset = READ_ONCE(entry->offset);
+	mask = 1;
+	count++;
+	gfnring->reset_index++;
+	while (gfnring->reset_index != fetch) {
+		entry = &ring->dirty_gfns[gfnring->reset_index &
+			(gfnring->size - 1)];
+		smp_read_barrier_depends();
+		next_slot = READ_ONCE(entry->slot);
+		next_offset = READ_ONCE(entry->offset);
+		gfnring->reset_index++;
+		count++;
+		/*
+		 * Try to coalesce the reset operations when the guest is
+		 * scanning pages in the same slot.
+		 */
+		if (next_slot == cur_slot) {
+			int delta = next_offset - cur_offset;
+
+			if (delta >= 0 && delta < BITS_PER_LONG) {
+				mask |= 1ull << delta;
+				continue;
+			}
+
+			/* Backwards visit, careful about overflows!  */
+			if (delta > -BITS_PER_LONG && delta < 0 &&
+			    (mask << -delta >> -delta) == mask) {
+				cur_offset = next_offset;
+				mask = (mask << -delta) | 1;
+				continue;
+			}
+		}
+		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+		cur_slot = next_slot;
+		cur_offset = next_offset;
+		mask = 1;
+	}
+	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+	return count;
+}
+
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+		      u32 slot,
+		      u64 offset,
+		      bool locked)
+{
+	int ret;
+	u16 num;
+	struct kvm_dirty_gfn *entry;
+
+	if (locked)
+		spin_lock(&gfnring->lock);
+
+	num = (u16)(gfnring->dirty_index - gfnring->reset_index);
+	if (num >= gfnring->size) {
+		WARN_ON_ONCE(num > gfnring->size);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	entry = &gfnring->dirty_ring->dirty_gfns[gfnring->dirty_index &
+			(gfnring->size - 1)];
+	entry->slot = slot;
+	entry->offset = offset;
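+	/*
+	 * Order the entry writes above before the index updates below so
+	 * that userspace never sees avail_index cover a half-written entry.
+	 */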
+	smp_wmb();
+	gfnring->dirty_index++;
+	num = gfnring->dirty_index - gfnring->reset_index;
+	gfnring->dirty_ring->indices.avail_index = gfnring->dirty_index;
+	ret = num >= gfnring->soft_limit;
+
+out:
+	if (locked)
+		spin_unlock(&gfnring->lock);
+
+	return ret;
+}
+
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
+{
+	return vmalloc_to_page((void *)ring->dirty_ring + i * PAGE_SIZE);
+
+}
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *gfnring)
+{
+	if (gfnring->dirty_ring)
+		vfree(gfnring->dirty_ring);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e8b3d98..8d4b6a7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -65,9 +65,16 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+#include <linux/kvm_gfn_ring.h>
+
 /* Worst case buffer size needed for holding an integer. */
 #define ITOA_MAX_LEN 12
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+/* slack between the soft limit and a completely full dirty log ring */
+#define DIRTY_RING_BUFFER_ENTRY_NUM 16
+#endif
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -127,6 +134,12 @@  static void mark_page_dirty_in_slot(struct kvm *kvm,
 				    struct kvm_vcpu *vcpu,
 				    struct kvm_memory_slot *memslot,
 				    gfn_t gfn);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn);
+#endif
 
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -296,11 +309,36 @@  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->preempted = false;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size) {
+		u32 limit = (kvm->dirty_ring_size /
+			     sizeof(struct kvm_dirty_gfn)) -
+			    DIRTY_RING_BUFFER_ENTRY_NUM -
+			    kvm_cpu_dirty_log_size();
+		r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
+				       kvm->dirty_ring_size,
+				       limit);
+		if (r) {
+			kvm->dirty_ring_size = 0;
+			goto fail_free_run;
+		}
+	}
+#endif
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		goto fail_free_ring;
+#else
 		goto fail_free_run;
+#endif
 	return 0;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+fail_free_ring:
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 fail_free_run:
 	free_page((unsigned long)vcpu->run);
 fail:
@@ -318,6 +356,10 @@  void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 	put_pid(rcu_dereference_protected(vcpu->pid, 1));
 	kvm_arch_vcpu_uninit(vcpu);
 	free_page((unsigned long)vcpu->run);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (vcpu->kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
@@ -727,6 +769,10 @@  static void kvm_destroy_vm(struct kvm *kvm)
 		kvm->buses[i] = NULL;
 	}
 	kvm_coalesced_mmio_free(kvm);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&kvm->dirty_ring);
+#endif
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
@@ -2057,6 +2103,9 @@  static void mark_page_dirty_in_slot(struct kvm *kvm,
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		mark_page_dirty_in_ring(kvm, vcpu, memslot, gfn);
+#endif
 		set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }
@@ -2383,6 +2432,13 @@  static int kvm_vcpu_fault(struct vm_fault *vmf)
 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	else if ((vmf->pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
+		 (vmf->pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
+		  vcpu->kvm->dirty_ring_size / PAGE_SIZE))
+		page = kvm_gfn_ring_get_page(&vcpu->dirty_ring,
+				vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
+#endif
 	else
 		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
@@ -2966,14 +3022,128 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 }
 
 #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn)
+{
+	struct kvm_gfn_ring *gfnlist;
+	u32 as_id = 0;
+	u64 offset;
+	struct kvm_vcpu *exit_vcpu;
+	struct kvm_vcpu *ring_vcpu;
+	int ret;
+	bool locked = false;
+
+	if (!kvm->dirty_ring_size)
+		return;
+
+	offset = gfn - slot->base_gfn;
+
+	if (test_bit_le(offset, slot->dirty_bitmap))
+		return;
+
+	if (vcpu) {
+		as_id = kvm_arch_vcpu_memslots_id(vcpu);
+		ring_vcpu = vcpu;
+	} else {
+		as_id = 0;
+		ring_vcpu = kvm_get_running_vcpu();
+	}
+
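+	/*
+	 * Dirtying from a vcpu context uses that vcpu's private ring;
+	 * otherwise fall back to the spinlock-protected global ring and
+	 * pick vcpu0 to exit to userspace when the ring fills up.
+	 */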
+	if (ring_vcpu) {
+		gfnlist = &ring_vcpu->dirty_ring;
+		exit_vcpu = ring_vcpu;
+	} else {
+		gfnlist = &kvm->dirty_ring;
+		exit_vcpu = kvm->vcpus[0];
+		locked = true;
+	}
+
+	ret = kvm_gfn_ring_push(gfnlist, (as_id << 16)|slot->id,
+				offset, locked);
+	if (ret < 0) {
+		if (vcpu)
+			WARN_ONCE(1, "vcpu %d dirty log overflow\n",
+				vcpu->vcpu_id);
+		else
+			WARN_ONCE(1, "global dirty log overflow\n");
+		return;
+	}
+
+	if (ret)
+		kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, exit_vcpu);
+}
+
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+	struct kvm_memory_slot *memslot;
+	int as_id, id;
+
+	as_id = slot >> 16;
+	id = (u16)slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+	if (offset >= memslot->npages)
+		return;
+
+	spin_lock(&kvm->mmu_lock);
+	/* FIXME: we should use a single AND operation, but there is no
+	 * applicable atomic API.  Re-arm the dirty traps before the loop
+	 * below consumes mask.
+	 */
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+	while (mask) {
+		clear_bit_le(offset + __ffs(mask), memslot->dirty_bitmap);
+		mask &= mask - 1;
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
 {
-	return -EINVAL;
+	int r;
+	u32 limit;
+
+	/* the size should be a power of 2 */
+	if (!size || (size & (size - 1)))
+		return -EINVAL;
+
+	kvm->dirty_ring_size = size;
+	limit = (size/sizeof(struct kvm_dirty_gfn)) -
+		DIRTY_RING_BUFFER_ENTRY_NUM;
+	r = kvm_gfn_ring_alloc(&kvm->dirty_ring, size, limit);
+	if (r) {
+		kvm->dirty_ring_size = 0;
+		return r;
+	}
+	return 0;
 }
 
 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
 {
-	return -EINVAL;
+	int i;
+	struct kvm_vcpu *vcpu;
+	int cleared = 0;
+
+	if (!kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->slots_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		cleared += kvm_gfn_ring_reset(kvm, &vcpu->dirty_ring);
+
+	cleared += kvm_gfn_ring_reset(kvm, &kvm->dirty_ring);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return cleared;
 }
 #endif
 
@@ -3219,6 +3389,29 @@  static long kvm_vm_compat_ioctl(struct file *filp,
 }
 #endif
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_fault(struct vm_fault *vmf)
+{
+	struct kvm *kvm = vmf->vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvm->dirty_ring_size / PAGE_SIZE)
+		return VM_FAULT_SIGBUS;
+
+	page = kvm_gfn_ring_get_page(&kvm->dirty_ring, vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_vm_vm_ops = {
+	.fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vm_vm_ops;
+	return 0;
+}
+#endif
+
 static struct file_operations kvm_vm_fops = {
 	.release        = kvm_vm_release,
 	.unlocked_ioctl = kvm_vm_ioctl,
@@ -3226,6 +3419,9 @@  static long kvm_vm_compat_ioctl(struct file *filp,
 	.compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
 	.llseek		= noop_llseek,
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	.mmap           = kvm_vm_mmap,
+#endif
 };
 
 static int kvm_dev_ioctl_create_vm(unsigned long type)