
[2/6] KVM: Dirty memory tracking for performant checkpointing and improved live migration

Message ID BL2PR08MB4817EACCCAE5EDEB0281228F0630@BL2PR08MB481.namprd08.prod.outlook.com (mailing list archive)
State New, archived

Commit Message

Cao, Lei April 26, 2016, 7:23 p.m. UTC
Introduce memory tracking data structures and implement the new
setup/cleanup ioctls.

Chief among the KVM performance bottlenecks is the use of large
(VM-sized) bitmaps to convey the information about which pages of memory
are dirtied. These bitmaps are copied to user-space when QEMU queries
KVM for its dirty page information. The use of bitmaps makes sense in the
current live-migration method, as it is possible for all of memory to be
dirty from one log-dirty pass to another. But in a checkpoint system, the
number of dirty pages is bounded such that the VM is paused when it has
dirtied a pre-defined number of pages. Traversing a large, sparsely
populated bitmap to find set bits is time-consuming, as is copying the
bitmap to user-space.

The preferred data structure for performant checkpoint implementations is
a dense list of guest frame numbers (GFN), which also plays well with PML.
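
To illustrate the cost difference on the consumer side, here is a minimal
sketch; the buffer layouts and helper names are illustrative assumptions,
not the interface added by this series:

#include <stddef.h>
#include <linux/types.h>

/* Bitmap case: the scan cost scales with the size of guest memory. */
static void drain_bitmap(const unsigned long *bitmap, size_t nlongs,
			 void (*handle)(__u64 gfn))
{
	const size_t bits = sizeof(unsigned long) * 8;
	size_t i;

	for (i = 0; i < nlongs; i++) {
		unsigned long w = bitmap[i];

		while (w) {
			handle(i * bits + __builtin_ctzl(w));
			w &= w - 1;	/* clear the lowest set bit */
		}
	}
}

/* Dense GFN list: the cost scales with the (bounded) number of dirty pages. */
static void drain_gfn_list(const __u64 *dirty_gfns, size_t count,
			   void (*handle)(__u64 gfn))
{
	size_t i;

	for (i = 0; i < count; i++)
		handle(dirty_gfns[i]);
}

In a checkpoint system the second loop touches at most max_dirty entries,
which is exactly the bound the VM is paused against, whereas the first loop
always walks the entire bitmap.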

Signed-off-by: Lei Cao <lei.cao@stratus.com>
---
 include/linux/kvm_host.h |  53 ++++++++
 virt/kvm/kvm_main.c      | 255 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 307 insertions(+), 1 deletion(-)

Comments

Kai Huang April 28, 2016, 9:05 a.m. UTC | #1
Hi,

On 4/27/2016 7:23 AM, Cao, Lei wrote:
> Introduce memory tracking data structures and implement the new
> setup/cleanup ioctls.
>
> Chief among the KVM performance bottlenecks is the use of large
> (VM-sized) bitmaps to convey the information about which pages of memory
> are dirtied. These bitmaps are copied to user-space when QEMU queries
> KVM for its dirty page information. The use of bitmaps makes sense in the
> current live-migration method, as it is possible for all of memory to be
> dirty from one log-dirty pass to another. But in a checkpoint system, the
> number of dirty pages is bounded such that the VM is paused when it has
> dirtied a pre-defined number of pages. Traversing a large, sparsely
> populated bitmap to find set bits is time-consuming, as is copying the
> bitmap to user-space.
>
> The preferred data structure for performant checkpoint implementations is
> a dense list of guest frame numbers (GFN), which also plays well with PML.
>
> Signed-off-by: Lei Cao <lei.cao@stratus.com>
> ---
>  include/linux/kvm_host.h |  53 ++++++++
>  virt/kvm/kvm_main.c      | 255 ++++++++++++++++++++++++++++++++++++-
>  2 files changed, 307 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5276fe0..5793ecf 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -358,6 +358,54 @@ struct kvm_memslots {
>  	int used_slots;
>  };
>
> +struct gfn_list {
> +	int max_dirty;
> +	__u64 *dirty_gfns; /* slot / offset */
> +	int dirty_index;
> +	int fetch_index;
> +	int reset_index;
> +	int overflow;
> +	spinlock_t lock;
> +	struct mutex mtx;
> +};
> +
> +struct vcpu_mt {
> +	struct gfn_list gfn_list;
> +};
> +
> +struct sublist_waiter {
> +	wait_queue_head_t       wq;
> +	spinlock_t              lock;
> +	int goal;
> +};
> +
> +#define MAX_DIRTY_PAGE_BATCH   (4096/sizeof(int))

4096 -> PAGE_SIZE ?
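
That is, presumably:

	#define MAX_DIRTY_PAGE_BATCH   (PAGE_SIZE / sizeof(int))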

> +
> +struct kvm_mt {
> +	long            cp_id;
> +	int             active;
> +	int             mode;
> +	spinlock_t      lock;
> +	u64             max_gfn;
> +	int             quiesced;
> +	int             allow_blocking;
> +	void            *bmap;
> +	long            bmapsz;
> +	int             dirty_trigger;
> +
> +	struct gfn_list gfn_list;
> +
> +	int             tot_pages;
> +	int             fetch_count;
> +
> +	struct mutex    sublist_mtx;
> +	struct sublist_waiter sw;
> +	int             sw_busy;
> +	spinlock_t      sw_lock;
> +
> +	__u64           *gfn_buf;
> +};
> +
>  struct kvm {
>  	spinlock_t mmu_lock;
>  	struct mutex slots_lock;
> @@ -407,6 +455,11 @@ struct kvm {
>  #endif
>  	long tlbs_dirty;
>  	struct list_head devices;
> +
> +
> +	struct kvm_mt mt;
> +
> +	struct vcpu_mt vcpu_mt[KVM_MAX_VCPUS];

Why not embed it in kvm_vcpu?
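
Roughly something like this, I suppose (a sketch of the suggested layout
only, not an actual follow-up patch):

	struct kvm_vcpu {
		/* ... existing fields ... */
		struct vcpu_mt mt;	/* per-vcpu dirty-GFN state */
	};

so the per-vcpu list is reached as &vcpu->mt.gfn_list instead of indexing
vcpu->kvm->vcpu_mt[vcpu->vcpu_id].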

>  };
>
>  #define kvm_err(fmt, ...) \
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 8a582e5..fe46067 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -220,10 +220,48 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
>  	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
>  }
>
> +static void init_gfn_list(struct gfn_list *gfnlist)
> +{
> +	gfnlist->dirty_gfns = NULL;
> +	gfnlist->dirty_index = 0;
> +	gfnlist->fetch_index = 0;
> +	gfnlist->reset_index = 0;
> +	gfnlist->overflow = 0;
> +}
> +
> +static void clear_gfn_list(struct gfn_list *gfnlist)
> +{
> +	init_gfn_list(gfnlist);
> +	gfnlist->max_dirty = 0;
> +}
> +
> +static void clear_kvm_mt(struct kvm *kvm)
> +{
> +	clear_gfn_list(&kvm->mt.gfn_list);
> +	kvm->mt.active = 0;
> +	kvm->mt.mode = 0;
> +	kvm->mt.tot_pages  = kvm->mt.fetch_count = 0;
> +}
> +
> +static void clear_vcpu_mt(struct kvm_vcpu *vcpu)
> +{
> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +
> +	clear_gfn_list(gfnlist);
> +}
> +
> +static void init_vcpu_mt(struct kvm_vcpu *vcpu)
> +{
> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +
> +	init_gfn_list(gfnlist);
> +}
> +
>  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>  {
>  	struct page *page;
>  	int r;
> +	struct gfn_list *gfnlist;
>
>  	mutex_init(&vcpu->mutex);
>  	vcpu->cpu = -1;
> @@ -236,6 +274,11 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>  	vcpu->pre_pcpu = -1;
>  	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
>
> +	gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +	clear_vcpu_mt(vcpu);
> +	spin_lock_init(&gfnlist->lock);
> +	mutex_init(&gfnlist->mtx);
> +
>  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>  	if (!page) {
>  		r = -ENOMEM;
> @@ -602,6 +645,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
>
>  	preempt_notifier_inc();
>
> +	clear_kvm_mt(kvm);
> +	spin_lock_init(&kvm->mt.lock);
> +	spin_lock_init(&kvm->mt.sw_lock);
> +	spin_lock_init(&kvm->mt.gfn_list.lock);
> +	mutex_init(&kvm->mt.gfn_list.mtx);
> +	init_waitqueue_head(&kvm->mt.sw.wq);
> +	spin_lock_init(&kvm->mt.sw.lock);
> +	kvm->mt.sw.goal = 0;
> +	mutex_init(&kvm->mt.sublist_mtx);
> +
>  	return kvm;
>
>  out_err:
> @@ -642,11 +695,14 @@ static void kvm_destroy_devices(struct kvm *kvm)
>  	}
>  }
>
> +static int kvm_cleanup_mt(struct kvm *kvm, int force);
> +
>  static void kvm_destroy_vm(struct kvm *kvm)
>  {
>  	int i;
>  	struct mm_struct *mm = kvm->mm;
>
> +	kvm_cleanup_mt(kvm, 1);
>  	kvm_arch_sync_events(kvm);
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
> @@ -2752,9 +2808,206 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  	return kvm_vm_ioctl_check_extension(kvm, arg);
>  }
>
> +static u64 kvm_get_max_gfn(struct kvm *kvm)
> +{
> +	int num_gfn = -1;
> +	struct kvm_memslots *slots = kvm_memslots(kvm);
> +	struct kvm_memory_slot *memslot;
> +	int gfn;
> +
> +	kvm_for_each_memslot(memslot, slots) {
> +		gfn = memslot->base_gfn + memslot->npages;
> +		if (gfn > num_gfn)
> +			num_gfn = gfn;
> +	}
> +	return num_gfn - 1;
> +}

This might be wrong if there are devices assigned to the guest, as each 
assigned resource also occupies a memory slot. I assume you don't want 
to monitor assigned resources, do you?
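
For example (slot numbers and sizes made up purely for illustration):

	slot 0: guest RAM      base_gfn 0x000000, npages 0x100000
	slot 9: assigned BAR   base_gfn 0xc00000, npages 0x10

Here kvm_get_max_gfn() returns 0xc0000f rather than 0xfffff, so a
"gfn <= max_gfn" test no longer implies the gfn is guest RAM.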

> +
> +#define DIRTY_GFN_ADD_GRANULARITY      (256)
> +
> +/*
> + * Return the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
> + */
> +static int mt_calc_max_dirty(int goal)
> +{
> +	int val = DIRTY_GFN_ADD_GRANULARITY;
> +
> +	while (val < goal)
> +		val += DIRTY_GFN_ADD_GRANULARITY;
> +	return val;
> +}
> +
> +static int __kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
> +{
> +	struct kvm_vcpu *vcpu;
> +	int i, alloc_fail = 0;
> +	struct gfn_list *gfnlist = &kvm->mt.gfn_list;
> +	int max_dirty;
> +
> +	if (gfnlist->dirty_gfns) {
> +		pr_warn("KVM: vm %d, MT already initialized\n",
> +			current->pid);
> +		return -EBUSY;
> +	}
> +
> +	/*
> +	 * Internally, max_dirty must be a multiple of DIRTY_GFN_ADD_GRANULARITY
> +	 */
> +	max_dirty = mt_calc_max_dirty(mts->max_dirty);
> +
> +	/* probably need to make gfn lists for each vcpu to avoid locking */
> +	gfnlist->dirty_gfns = vmalloc(max_dirty *
> +					sizeof(*(gfnlist->dirty_gfns)));
> +	if (!gfnlist->dirty_gfns)
> +		return -ENOMEM;
> +
> +	if (mts->flags & KVM_MT_OPTION_NO_DUPS) {
> +		long bmapsz = sizeof(long) *
> +			((kvm->mt.max_gfn + BITS_PER_LONG)/BITS_PER_LONG);
> +		kvm->mt.bmap = vmalloc(bmapsz);
> +		if (!kvm->mt.bmap) {
> +			vfree(gfnlist->dirty_gfns);
> +			gfnlist->dirty_gfns = NULL;
> +			return -ENOMEM;
> +		}
> +		kvm->mt.bmapsz = bmapsz;
> +		memset(kvm->mt.bmap, 0, bmapsz);
> +		pr_info("KVM: vm %d, using bmap, size %ld\n",
> +			current->pid, bmapsz);
> +	}
> +
> +	gfnlist->max_dirty = max_dirty;
> +	gfnlist->dirty_index = 0;
> +
> +	kvm->mt.active = 0;
> +	kvm->mt.tot_pages = 0;
> +	kvm->mt.fetch_count = 0;
> +	kvm->mt.quiesced = 0;
> +	kvm->mt.allow_blocking = 1;
> +	kvm->mt.sw_busy = 0;
> +	kvm->mt.gfn_buf = vmalloc(max_dirty * sizeof(*(kvm->mt.gfn_buf)));
> +	if (!kvm->mt.gfn_buf) {
> +		vfree(gfnlist->dirty_gfns);
> +		gfnlist->dirty_gfns = NULL;
> +		if (kvm->mt.bmap) {
> +			vfree(kvm->mt.bmap);
> +			kvm->mt.bmap = NULL;
> +		}
> +		return -ENOMEM;
> +	}
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		init_vcpu_mt(vcpu);
> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +		gfnlist->dirty_gfns =
> +			vmalloc(max_dirty *
> +				sizeof(*(gfnlist->dirty_gfns)));
> +		if (!gfnlist->dirty_gfns) {
> +			pr_warn("KVM: vm %d, MT init, alloc fail, vcpu %d\n",
> +				current->pid, i);
> +			alloc_fail = 1;
> +			break;
> +		}
> +		gfnlist->max_dirty = max_dirty;
> +	}
> +
> +	if (alloc_fail) {
> +		/* cleanup and bail out */
> +		kvm_for_each_vcpu(i, vcpu, kvm) {
> +			gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +			if (gfnlist->dirty_gfns)
> +				vfree(gfnlist->dirty_gfns);
> +			clear_vcpu_mt(vcpu);
> +		}
> +
> +		vfree(kvm->mt.gfn_list.dirty_gfns);
> +		kvm->mt.gfn_list.dirty_gfns = NULL;
> +		kvm->mt.bmap = NULL;
> +		vfree(kvm->mt.gfn_buf);
> +		kvm->mt.gfn_buf = NULL;
> +		clear_kvm_mt(kvm);
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
> +static int kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
> +{
> +	int rc = 0;
> +
> +	if (mts->version != KVM_MT_VERSION) {
> +		pr_warn(
> +			"KVM: vm %d, MT version error, kvm %d, qemu %d\n",
> +			current->pid, KVM_MT_VERSION, mts->version);
> +		rc = -EINVAL;
> +		goto init_mt_done;
> +	}
> +
> +	if (mts->max_dirty == 0) {
> +		pr_warn("KVM: vm %d, MT max_dirty is zero?\n",
> +			current->pid);
> +		rc = -EINVAL;
> +		goto init_mt_done;
> +	}
> +
> +	kvm->mt.max_gfn = kvm_get_max_gfn(kvm);
> +
> +	rc = __kvm_init_mt(kvm, mts);
> +
> +init_mt_done:
> +	return rc;
> +}
> +
> +static int kvm_cleanup_mt(struct kvm *kvm, int force)
> +{
> +	int rc = 0;
> +	int i;
> +	struct kvm_vcpu *vcpu;
> +
> +	if (!force && kvm->mt.active) {
> +		pr_warn("KVM: vm %d, MT still active!\n",
> +			current->pid);
> +		rc = -EBUSY;
> +		goto cleanup_mt_done;
> +	}
> +
> +	if (kvm->mt.gfn_list.dirty_gfns)
> +		vfree(kvm->mt.gfn_list.dirty_gfns);
> +
> +	clear_kvm_mt(kvm);
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		int id = vcpu->vcpu_id;
> +		struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[id].gfn_list;
> +
> +		if (gfnlist->dirty_gfns)
> +			vfree(gfnlist->dirty_gfns);
> +		clear_vcpu_mt(vcpu);
> +	}
> +
> +	if (kvm->mt.bmap)
> +		vfree(kvm->mt.bmap);
> +	kvm->mt.bmap = NULL;
> +	if (kvm->mt.gfn_buf)
> +		vfree(kvm->mt.gfn_buf);
> +	kvm->mt.gfn_buf = NULL;
> +
> +	kvm->mt.active = 0;
> +
> +cleanup_mt_done:
> +
> +	return rc;
> +}
> +
>  static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
>  {
> -	return -EINVAL;
> +	if (mts->op == KVM_MT_OP_INIT)
> +		return kvm_init_mt(kvm, mts);
> +	else if (mts->op == KVM_MT_OP_CLEANUP)
> +		return kvm_cleanup_mt(kvm, 0);
> +	else
> +		return -EINVAL;
>  }
>
>  static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
>
Cao, Lei April 28, 2016, 7:28 p.m. UTC | #2
On 4/28/2016 5:05 AM, Huang, Kai wrote:
> Hi,
> 
> On 4/27/2016 7:23 AM, Cao, Lei wrote:
>> Introduce memory tracking data structures and implement the new
>> setup/cleanup ioctls.
>>
>> Chief among the KVM performance bottlenecks is the use of large
>> (VM-sized) bitmaps to convey the information about which pages of memory
>> are dirtied. These bitmaps are copied to user-space when QEMU queries
>> KVM for its dirty page information. The use of bitmaps makes sense in the
>> current live-migration method, as it is possible for all of memory to be
>> dirty from one log-dirty pass to another. But in a checkpoint system, the
>> number of dirty pages is bounded such that the VM is paused when it has
>> dirtied a pre-defined number of pages. Traversing a large, sparsely
>> populated bitmap to find set bits is time-consuming, as is copying the
>> bitmap to user-space.
>>
>> The preferred data structure for performant checkpoint implementations is
>> a dense list of guest frame numbers (GFN), which also plays well with PML.
>>
>> Signed-off-by: Lei Cao <lei.cao@stratus.com>
>> ---
>>  include/linux/kvm_host.h |  53 ++++++++
>>  virt/kvm/kvm_main.c      | 255 ++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 307 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 5276fe0..5793ecf 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -358,6 +358,54 @@ struct kvm_memslots {
>>  	int used_slots;
>>  };
>>
>> +struct gfn_list {
>> +	int max_dirty;
>> +	__u64 *dirty_gfns; /* slot / offset */
>> +	int dirty_index;
>> +	int fetch_index;
>> +	int reset_index;
>> +	int overflow;
>> +	spinlock_t lock;
>> +	struct mutex mtx;
>> +};
>> +
>> +struct vcpu_mt {
>> +	struct gfn_list gfn_list;
>> +};
>> +
>> +struct sublist_waiter {
>> +	wait_queue_head_t       wq;
>> +	spinlock_t              lock;
>> +	int goal;
>> +};
>> +
>> +#define MAX_DIRTY_PAGE_BATCH   (4096/sizeof(int))
> 
> 4096 -> PAGE_SIZE ?

Right. This define is no longer used, will remove.

> 
>> +
>> +struct kvm_mt {
>> +	long            cp_id;
>> +	int             active;
>> +	int             mode;
>> +	spinlock_t      lock;
>> +	u64             max_gfn;
>> +	int             quiesced;
>> +	int             allow_blocking;
>> +	void            *bmap;
>> +	long            bmapsz;
>> +	int             dirty_trigger;
>> +
>> +	struct gfn_list gfn_list;
>> +
>> +	int             tot_pages;
>> +	int             fetch_count;
>> +
>> +	struct mutex    sublist_mtx;
>> +	struct sublist_waiter sw;
>> +	int             sw_busy;
>> +	spinlock_t      sw_lock;
>> +
>> +	__u64           *gfn_buf;
>> +};
>> +
>>  struct kvm {
>>  	spinlock_t mmu_lock;
>>  	struct mutex slots_lock;
>> @@ -407,6 +455,11 @@ struct kvm {
>>  #endif
>>  	long tlbs_dirty;
>>  	struct list_head devices;
>> +
>> +
>> +	struct kvm_mt mt;
>> +
>> +	struct vcpu_mt vcpu_mt[KVM_MAX_VCPUS];
> 
> Why not embed it in kvm_vcpu?

Good idea. Will do.

> 
>>  };
>>
>>  #define kvm_err(fmt, ...) \
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 8a582e5..fe46067 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -220,10 +220,48 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
>>  	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
>>  }
>>
>> +static void init_gfn_list(struct gfn_list *gfnlist)
>> +{
>> +	gfnlist->dirty_gfns = NULL;
>> +	gfnlist->dirty_index = 0;
>> +	gfnlist->fetch_index = 0;
>> +	gfnlist->reset_index = 0;
>> +	gfnlist->overflow = 0;
>> +}
>> +
>> +static void clear_gfn_list(struct gfn_list *gfnlist)
>> +{
>> +	init_gfn_list(gfnlist);
>> +	gfnlist->max_dirty = 0;
>> +}
>> +
>> +static void clear_kvm_mt(struct kvm *kvm)
>> +{
>> +	clear_gfn_list(&kvm->mt.gfn_list);
>> +	kvm->mt.active = 0;
>> +	kvm->mt.mode = 0;
>> +	kvm->mt.tot_pages  = kvm->mt.fetch_count = 0;
>> +}
>> +
>> +static void clear_vcpu_mt(struct kvm_vcpu *vcpu)
>> +{
>> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +	clear_gfn_list(gfnlist);
>> +}
>> +
>> +static void init_vcpu_mt(struct kvm_vcpu *vcpu)
>> +{
>> +	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +
>> +	init_gfn_list(gfnlist);
>> +}
>> +
>>  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>>  {
>>  	struct page *page;
>>  	int r;
>> +	struct gfn_list *gfnlist;
>>
>>  	mutex_init(&vcpu->mutex);
>>  	vcpu->cpu = -1;
>> @@ -236,6 +274,11 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>>  	vcpu->pre_pcpu = -1;
>>  	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
>>
>> +	gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +	clear_vcpu_mt(vcpu);
>> +	spin_lock_init(&gfnlist->lock);
>> +	mutex_init(&gfnlist->mtx);
>> +
>>  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>>  	if (!page) {
>>  		r = -ENOMEM;
>> @@ -602,6 +645,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
>>
>>  	preempt_notifier_inc();
>>
>> +	clear_kvm_mt(kvm);
>> +	spin_lock_init(&kvm->mt.lock);
>> +	spin_lock_init(&kvm->mt.sw_lock);
>> +	spin_lock_init(&kvm->mt.gfn_list.lock);
>> +	mutex_init(&kvm->mt.gfn_list.mtx);
>> +	init_waitqueue_head(&kvm->mt.sw.wq);
>> +	spin_lock_init(&kvm->mt.sw.lock);
>> +	kvm->mt.sw.goal = 0;
>> +	mutex_init(&kvm->mt.sublist_mtx);
>> +
>>  	return kvm;
>>
>>  out_err:
>> @@ -642,11 +695,14 @@ static void kvm_destroy_devices(struct kvm *kvm)
>>  	}
>>  }
>>
>> +static int kvm_cleanup_mt(struct kvm *kvm, int force);
>> +
>>  static void kvm_destroy_vm(struct kvm *kvm)
>>  {
>>  	int i;
>>  	struct mm_struct *mm = kvm->mm;
>>
>> +	kvm_cleanup_mt(kvm, 1);
>>  	kvm_arch_sync_events(kvm);
>>  	spin_lock(&kvm_lock);
>>  	list_del(&kvm->vm_list);
>> @@ -2752,9 +2808,206 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>>  	return kvm_vm_ioctl_check_extension(kvm, arg);
>>  }
>>
>> +static u64 kvm_get_max_gfn(struct kvm *kvm)
>> +{
>> +	int num_gfn = -1;
>> +	struct kvm_memslots *slots = kvm_memslots(kvm);
>> +	struct kvm_memory_slot *memslot;
>> +	int gfn;
>> +
>> +	kvm_for_each_memslot(memslot, slots) {
>> +		gfn = memslot->base_gfn + memslot->npages;
>> +		if (gfn > num_gfn)
>> +			num_gfn = gfn;
>> +	}
>> +	return num_gfn - 1;
>> +}
> 
> This might be wrong if there are devices assigned to the guest, as each 
> assigned resource also occupies a memory slot. I assume you don't want 
> to monitor assigned resources, do you?

max gfn is only used for sanity check. When we are told to mark a page dirty,
or to reset the write trap for a page, we make sure that the gfn is valid.
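
I.e., conceptually something like this (a sketch only; the actual check
lives in the later patches of this series):

	if (gfn > kvm->mt.max_gfn)
		return;		/* refuse to track a GFN outside the expected range */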

> 
>> +
>> +#define DIRTY_GFN_ADD_GRANULARITY      (256)
>> +
>> +/*
>> + * Return the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
>> + */
>> +static int mt_calc_max_dirty(int goal)
>> +{
>> +	int val = DIRTY_GFN_ADD_GRANULARITY;
>> +
>> +	while (val < goal)
>> +		val += DIRTY_GFN_ADD_GRANULARITY;
>> +	return val;
>> +}
>> +
>> +static int __kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
>> +{
>> +	struct kvm_vcpu *vcpu;
>> +	int i, alloc_fail = 0;
>> +	struct gfn_list *gfnlist = &kvm->mt.gfn_list;
>> +	int max_dirty;
>> +
>> +	if (gfnlist->dirty_gfns) {
>> +		pr_warn("KVM: vm %d, MT already initialized\n",
>> +			current->pid);
>> +		return -EBUSY;
>> +	}
>> +
>> +	/*
>> +	 * Internally, max_dirty must be a multiple of DIRTY_GFN_ADD_GRANULARITY
>> +	 */
>> +	max_dirty = mt_calc_max_dirty(mts->max_dirty);
>> +
>> +	/* probably need to make gfn lists for each vcpu to avoid locking */
>> +	gfnlist->dirty_gfns = vmalloc(max_dirty *
>> +					sizeof(*(gfnlist->dirty_gfns)));
>> +	if (!gfnlist->dirty_gfns)
>> +		return -ENOMEM;
>> +
>> +	if (mts->flags & KVM_MT_OPTION_NO_DUPS) {
>> +		long bmapsz = sizeof(long) *
>> +			((kvm->mt.max_gfn + BITS_PER_LONG)/BITS_PER_LONG);
>> +		kvm->mt.bmap = vmalloc(bmapsz);
>> +		if (!kvm->mt.bmap) {
>> +			vfree(gfnlist->dirty_gfns);
>> +			gfnlist->dirty_gfns = NULL;
>> +			return -ENOMEM;
>> +		}
>> +		kvm->mt.bmapsz = bmapsz;
>> +		memset(kvm->mt.bmap, 0, bmapsz);
>> +		pr_info("KVM: vm %d, using bmap, size %ld\n",
>> +			current->pid, bmapsz);
>> +	}
>> +
>> +	gfnlist->max_dirty = max_dirty;
>> +	gfnlist->dirty_index = 0;
>> +
>> +	kvm->mt.active = 0;
>> +	kvm->mt.tot_pages = 0;
>> +	kvm->mt.fetch_count = 0;
>> +	kvm->mt.quiesced = 0;
>> +	kvm->mt.allow_blocking = 1;
>> +	kvm->mt.sw_busy = 0;
>> +	kvm->mt.gfn_buf = vmalloc(max_dirty * sizeof(*(kvm->mt.gfn_buf)));
>> +	if (!kvm->mt.gfn_buf) {
>> +		vfree(gfnlist->dirty_gfns);
>> +		gfnlist->dirty_gfns = NULL;
>> +		if (kvm->mt.bmap) {
>> +			vfree(kvm->mt.bmap);
>> +			kvm->mt.bmap = NULL;
>> +		}
>> +		return -ENOMEM;
>> +	}
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		init_vcpu_mt(vcpu);
>> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +		gfnlist->dirty_gfns =
>> +			vmalloc(max_dirty *
>> +				sizeof(*(gfnlist->dirty_gfns)));
>> +		if (!gfnlist->dirty_gfns) {
>> +			pr_warn("KVM: vm %d, MT init, alloc fail, vcpu %d\n",
>> +				current->pid, i);
>> +			alloc_fail = 1;
>> +			break;
>> +		}
>> +		gfnlist->max_dirty = max_dirty;
>> +	}
>> +
>> +	if (alloc_fail) {
>> +		/* cleanup and bail out */
>> +		kvm_for_each_vcpu(i, vcpu, kvm) {
>> +			gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
>> +			if (gfnlist->dirty_gfns)
>> +				vfree(gfnlist->dirty_gfns);
>> +			clear_vcpu_mt(vcpu);
>> +		}
>> +
>> +		vfree(kvm->mt.gfn_list.dirty_gfns);
>> +		kvm->mt.gfn_list.dirty_gfns = NULL;
>> +		kvm->mt.bmap = NULL;
>> +		vfree(kvm->mt.gfn_buf);
>> +		kvm->mt.gfn_buf = NULL;
>> +		clear_kvm_mt(kvm);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
>> +{
>> +	int rc = 0;
>> +
>> +	if (mts->version != KVM_MT_VERSION) {
>> +		pr_warn(
>> +			"KVM: vm %d, MT version error, kvm %d, qemu %d\n",
>> +			current->pid, KVM_MT_VERSION, mts->version);
>> +		rc = -EINVAL;
>> +		goto init_mt_done;
>> +	}
>> +
>> +	if (mts->max_dirty == 0) {
>> +		pr_warn("KVM: vm %d, MT max_dirty is zero?\n",
>> +			current->pid);
>> +		rc = -EINVAL;
>> +		goto init_mt_done;
>> +	}
>> +
>> +	kvm->mt.max_gfn = kvm_get_max_gfn(kvm);
>> +
>> +	rc = __kvm_init_mt(kvm, mts);
>> +
>> +init_mt_done:
>> +	return rc;
>> +}
>> +
>> +static int kvm_cleanup_mt(struct kvm *kvm, int force)
>> +{
>> +	int rc = 0;
>> +	int i;
>> +	struct kvm_vcpu *vcpu;
>> +
>> +	if (!force && kvm->mt.active) {
>> +		pr_warn("KVM: vm %d, MT still active!\n",
>> +			current->pid);
>> +		rc = -EBUSY;
>> +		goto cleanup_mt_done;
>> +	}
>> +
>> +	if (kvm->mt.gfn_list.dirty_gfns)
>> +		vfree(kvm->mt.gfn_list.dirty_gfns);
>> +
>> +	clear_kvm_mt(kvm);
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		int id = vcpu->vcpu_id;
>> +		struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[id].gfn_list;
>> +
>> +		if (gfnlist->dirty_gfns)
>> +			vfree(gfnlist->dirty_gfns);
>> +		clear_vcpu_mt(vcpu);
>> +	}
>> +
>> +	if (kvm->mt.bmap)
>> +		vfree(kvm->mt.bmap);
>> +	kvm->mt.bmap = NULL;
>> +	if (kvm->mt.gfn_buf)
>> +		vfree(kvm->mt.gfn_buf);
>> +	kvm->mt.gfn_buf = NULL;
>> +
>> +	kvm->mt.active = 0;
>> +
>> +cleanup_mt_done:
>> +
>> +	return rc;
>> +}
>> +
>>  static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
>>  {
>> -	return -EINVAL;
>> +	if (mts->op == KVM_MT_OP_INIT)
>> +		return kvm_init_mt(kvm, mts);
>> +	else if (mts->op == KVM_MT_OP_CLEANUP)
>> +		return kvm_cleanup_mt(kvm, 0);
>> +	else
>> +		return -EINVAL;
>>  }
>>
>>  static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
>>
> 

Kai Huang April 28, 2016, 11:47 p.m. UTC | #3
>>>
>>> +static u64 kvm_get_max_gfn(struct kvm *kvm)
>>> +{
>>> +	int num_gfn = -1;
>>> +	struct kvm_memslots *slots = kvm_memslots(kvm);
>>> +	struct kvm_memory_slot *memslot;
>>> +	int gfn;
>>> +
>>> +	kvm_for_each_memslot(memslot, slots) {
>>> +		gfn = memslot->base_gfn + memslot->npages;
>>> +		if (gfn > num_gfn)
>>> +			num_gfn = gfn;
>>> +	}
>>> +	return num_gfn - 1;
>>> +}
>>
>> This might be wrong if there are devices assigned to the guest, as each
>> assigned resource also occupies a memory slot. I assume you don't want
>> to monitor assigned resources, do you?
>
> max gfn is only used for sanity check. When we are told to mark a page dirty,
> or to reset the write trap for a page, we make sure that the gfn is valid.
>

By 'sanity check' I think you mean checking whether the gfn is guest
memory. From that point of view, your code above may compute a much
*bigger* max_gfn than the real one, as I explained above, which basically
makes the 'sanity check' meaningless, so you can probably remove this
code.

Btw 'gfn is valid' is very vague, as a gfn belonging to an assigned
resource is certainly valid as well.

Thanks,
-Kai
Cao, Lei April 29, 2016, 6:49 p.m. UTC | #4
On 4/28/2016 7:47 PM, Huang, Kai wrote:
> 
>>>>
>>>> +static u64 kvm_get_max_gfn(struct kvm *kvm)
>>>> +{
>>>> +	int num_gfn = -1;
>>>> +	struct kvm_memslots *slots = kvm_memslots(kvm);
>>>> +	struct kvm_memory_slot *memslot;
>>>> +	int gfn;
>>>> +
>>>> +	kvm_for_each_memslot(memslot, slots) {
>>>> +		gfn = memslot->base_gfn + memslot->npages;
>>>> +		if (gfn > num_gfn)
>>>> +			num_gfn = gfn;
>>>> +	}
>>>> +	return num_gfn - 1;
>>>> +}
>>>
>>> This might be wrong if there are devices assigned to the guest, as each
>>> assigned resource also occupies a memory slot. I assume you don't want
>>> to monitor assigned resources, do you?
>>
>> max gfn is only used for sanity check. When we are told to mark a page dirty,
>> or to reset the write trap for a page, we make sure that the gfn is valid.
>>
> 
> By 'sanity check' I think you mean checking whether the gfn is guest
> memory. From that point of view, your code above may compute a much
> *bigger* max_gfn than the real one, as I explained above, which basically
> makes the 'sanity check' meaningless, so you can probably remove this
> code.
> 
> Btw 'gfn is valid' is very vague, as a gfn belonging to an assigned
> resource is certainly valid as well.
> 
> Thanks,
> -Kai
> 

Good point. Will fix that.

Thanks for your feedback, Kai.

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5276fe0..5793ecf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -358,6 +358,54 @@  struct kvm_memslots {
 	int used_slots;
 };
 
+struct gfn_list {
+	int max_dirty;
+	__u64 *dirty_gfns; /* slot / offset */
+	int dirty_index;
+	int fetch_index;
+	int reset_index;
+	int overflow;
+	spinlock_t lock;
+	struct mutex mtx;
+};
+
+struct vcpu_mt {
+	struct gfn_list gfn_list;
+};
+
+struct sublist_waiter {
+	wait_queue_head_t       wq;
+	spinlock_t              lock;
+	int goal;
+};
+
+#define MAX_DIRTY_PAGE_BATCH   (4096/sizeof(int))
+
+struct kvm_mt {
+	long            cp_id;
+	int             active;
+	int             mode;
+	spinlock_t      lock;
+	u64             max_gfn;
+	int             quiesced;
+	int             allow_blocking;
+	void            *bmap;
+	long            bmapsz;
+	int             dirty_trigger;
+
+	struct gfn_list gfn_list;
+
+	int             tot_pages;
+	int             fetch_count;
+
+	struct mutex    sublist_mtx;
+	struct sublist_waiter sw;
+	int             sw_busy;
+	spinlock_t      sw_lock;
+
+	__u64           *gfn_buf;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	struct mutex slots_lock;
@@ -407,6 +455,11 @@  struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
+
+
+	struct kvm_mt mt;
+
+	struct vcpu_mt vcpu_mt[KVM_MAX_VCPUS];
 };
 
 #define kvm_err(fmt, ...) \
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8a582e5..fe46067 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -220,10 +220,48 @@  void kvm_reload_remote_mmus(struct kvm *kvm)
 	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
+static void init_gfn_list(struct gfn_list *gfnlist)
+{
+	gfnlist->dirty_gfns = NULL;
+	gfnlist->dirty_index = 0;
+	gfnlist->fetch_index = 0;
+	gfnlist->reset_index = 0;
+	gfnlist->overflow = 0;
+}
+
+static void clear_gfn_list(struct gfn_list *gfnlist)
+{
+	init_gfn_list(gfnlist);
+	gfnlist->max_dirty = 0;
+}
+
+static void clear_kvm_mt(struct kvm *kvm)
+{
+	clear_gfn_list(&kvm->mt.gfn_list);
+	kvm->mt.active = 0;
+	kvm->mt.mode = 0;
+	kvm->mt.tot_pages  = kvm->mt.fetch_count = 0;
+}
+
+static void clear_vcpu_mt(struct kvm_vcpu *vcpu)
+{
+	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+	clear_gfn_list(gfnlist);
+}
+
+static void init_vcpu_mt(struct kvm_vcpu *vcpu)
+{
+	struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+	init_gfn_list(gfnlist);
+}
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
 	struct page *page;
 	int r;
+	struct gfn_list *gfnlist;
 
 	mutex_init(&vcpu->mutex);
 	vcpu->cpu = -1;
@@ -236,6 +274,11 @@  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->pre_pcpu = -1;
 	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
 
+	gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+	clear_vcpu_mt(vcpu);
+	spin_lock_init(&gfnlist->lock);
+	mutex_init(&gfnlist->mtx);
+
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
 		r = -ENOMEM;
@@ -602,6 +645,16 @@  static struct kvm *kvm_create_vm(unsigned long type)
 
 	preempt_notifier_inc();
 
+	clear_kvm_mt(kvm);
+	spin_lock_init(&kvm->mt.lock);
+	spin_lock_init(&kvm->mt.sw_lock);
+	spin_lock_init(&kvm->mt.gfn_list.lock);
+	mutex_init(&kvm->mt.gfn_list.mtx);
+	init_waitqueue_head(&kvm->mt.sw.wq);
+	spin_lock_init(&kvm->mt.sw.lock);
+	kvm->mt.sw.goal = 0;
+	mutex_init(&kvm->mt.sublist_mtx);
+
 	return kvm;
 
 out_err:
@@ -642,11 +695,14 @@  static void kvm_destroy_devices(struct kvm *kvm)
 	}
 }
 
+static int kvm_cleanup_mt(struct kvm *kvm, int force);
+
 static void kvm_destroy_vm(struct kvm *kvm)
 {
 	int i;
 	struct mm_struct *mm = kvm->mm;
 
+	kvm_cleanup_mt(kvm, 1);
 	kvm_arch_sync_events(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
@@ -2752,9 +2808,206 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+static u64 kvm_get_max_gfn(struct kvm *kvm)
+{
+	int num_gfn = -1;
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *memslot;
+	int gfn;
+
+	kvm_for_each_memslot(memslot, slots) {
+		gfn = memslot->base_gfn + memslot->npages;
+		if (gfn > num_gfn)
+			num_gfn = gfn;
+	}
+	return num_gfn - 1;
+}
+
+#define DIRTY_GFN_ADD_GRANULARITY      (256)
+
+/*
+ * Return the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
+ */
+static int mt_calc_max_dirty(int goal)
+{
+	int val = DIRTY_GFN_ADD_GRANULARITY;
+
+	while (val < goal)
+		val += DIRTY_GFN_ADD_GRANULARITY;
+	return val;
+}
+
+static int __kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
+{
+	struct kvm_vcpu *vcpu;
+	int i, alloc_fail = 0;
+	struct gfn_list *gfnlist = &kvm->mt.gfn_list;
+	int max_dirty;
+
+	if (gfnlist->dirty_gfns) {
+		pr_warn("KVM: vm %d, MT already initialized\n",
+			current->pid);
+		return -EBUSY;
+	}
+
+	/*
+	 * Internally, max_dirty must be a multiple of DIRTY_GFN_ADD_GRANULARITY
+	 */
+	max_dirty = mt_calc_max_dirty(mts->max_dirty);
+
+	/* probably need to make gfn lists for each vcpu to avoid locking */
+	gfnlist->dirty_gfns = vmalloc(max_dirty *
+					sizeof(*(gfnlist->dirty_gfns)));
+	if (!gfnlist->dirty_gfns)
+		return -ENOMEM;
+
+	if (mts->flags & KVM_MT_OPTION_NO_DUPS) {
+		long bmapsz = sizeof(long) *
+			((kvm->mt.max_gfn + BITS_PER_LONG)/BITS_PER_LONG);
+		kvm->mt.bmap = vmalloc(bmapsz);
+		if (!kvm->mt.bmap) {
+			vfree(gfnlist->dirty_gfns);
+			gfnlist->dirty_gfns = NULL;
+			return -ENOMEM;
+		}
+		kvm->mt.bmapsz = bmapsz;
+		memset(kvm->mt.bmap, 0, bmapsz);
+		pr_info("KVM: vm %d, using bmap, size %ld\n",
+			current->pid, bmapsz);
+	}
+
+	gfnlist->max_dirty = max_dirty;
+	gfnlist->dirty_index = 0;
+
+	kvm->mt.active = 0;
+	kvm->mt.tot_pages = 0;
+	kvm->mt.fetch_count = 0;
+	kvm->mt.quiesced = 0;
+	kvm->mt.allow_blocking = 1;
+	kvm->mt.sw_busy = 0;
+	kvm->mt.gfn_buf = vmalloc(max_dirty * sizeof(*(kvm->mt.gfn_buf)));
+	if (!kvm->mt.gfn_buf) {
+		vfree(gfnlist->dirty_gfns);
+		gfnlist->dirty_gfns = NULL;
+		if (kvm->mt.bmap) {
+			vfree(kvm->mt.bmap);
+			kvm->mt.bmap = NULL;
+		}
+		return -ENOMEM;
+	}
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		init_vcpu_mt(vcpu);
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+		gfnlist->dirty_gfns =
+			vmalloc(max_dirty *
+				sizeof(*(gfnlist->dirty_gfns)));
+		if (!gfnlist->dirty_gfns) {
+			pr_warn("KVM: vm %d, MT init, alloc fail, vcpu %d\n",
+				current->pid, i);
+			alloc_fail = 1;
+			break;
+		}
+		gfnlist->max_dirty = max_dirty;
+	}
+
+	if (alloc_fail) {
+		/* cleanup and bail out */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+			if (gfnlist->dirty_gfns)
+				vfree(gfnlist->dirty_gfns);
+			clear_vcpu_mt(vcpu);
+		}
+
+		vfree(kvm->mt.gfn_list.dirty_gfns);
+		kvm->mt.gfn_list.dirty_gfns = NULL;
+		kvm->mt.bmap = NULL;
+		vfree(kvm->mt.gfn_buf);
+		kvm->mt.gfn_buf = NULL;
+		clear_kvm_mt(kvm);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int kvm_init_mt(struct kvm *kvm, struct mt_setup *mts)
+{
+	int rc = 0;
+
+	if (mts->version != KVM_MT_VERSION) {
+		pr_warn(
+			"KVM: vm %d, MT version error, kvm %d, qemu %d\n",
+			current->pid, KVM_MT_VERSION, mts->version);
+		rc = -EINVAL;
+		goto init_mt_done;
+	}
+
+	if (mts->max_dirty == 0) {
+		pr_warn("KVM: vm %d, MT max_dirty is zero?\n",
+			current->pid);
+		rc = -EINVAL;
+		goto init_mt_done;
+	}
+
+	kvm->mt.max_gfn = kvm_get_max_gfn(kvm);
+
+	rc = __kvm_init_mt(kvm, mts);
+
+init_mt_done:
+	return rc;
+}
+
+static int kvm_cleanup_mt(struct kvm *kvm, int force)
+{
+	int rc = 0;
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	if (!force && kvm->mt.active) {
+		pr_warn("KVM: vm %d, MT still active!\n",
+			current->pid);
+		rc = -EBUSY;
+		goto cleanup_mt_done;
+	}
+
+	if (kvm->mt.gfn_list.dirty_gfns)
+		vfree(kvm->mt.gfn_list.dirty_gfns);
+
+	clear_kvm_mt(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		int id = vcpu->vcpu_id;
+		struct gfn_list *gfnlist = &vcpu->kvm->vcpu_mt[id].gfn_list;
+
+		if (gfnlist->dirty_gfns)
+			vfree(gfnlist->dirty_gfns);
+		clear_vcpu_mt(vcpu);
+	}
+
+	if (kvm->mt.bmap)
+		vfree(kvm->mt.bmap);
+	kvm->mt.bmap = NULL;
+	if (kvm->mt.gfn_buf)
+		vfree(kvm->mt.gfn_buf);
+	kvm->mt.gfn_buf = NULL;
+
+	kvm->mt.active = 0;
+
+cleanup_mt_done:
+
+	return rc;
+}
+
 static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
 {
-	return -EINVAL;
+	if (mts->op == KVM_MT_OP_INIT)
+		return kvm_init_mt(kvm, mts);
+	else if (mts->op == KVM_MT_OP_CLEANUP)
+		return kvm_cleanup_mt(kvm, 0);
+	else
+		return -EINVAL;
 }
 
 static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
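
For reference, userspace would drive this init/cleanup dispatch roughly as
follows. This is only a sketch: the ioctl command (spelled KVM_MT_SETUP
below as a placeholder) and the exact layout of struct mt_setup are defined
by an earlier patch in the series and are assumed here; only the field and
constant names visible in this patch are taken from the code above.

	struct mt_setup mts = {
		.op        = KVM_MT_OP_INIT,
		.version   = KVM_MT_VERSION,
		.max_dirty = 16384,			/* rounded up internally to a multiple of 256 */
		.flags     = KVM_MT_OPTION_NO_DUPS,	/* optional de-dup bitmap */
	};

	if (ioctl(vm_fd, KVM_MT_SETUP, &mts) < 0)	/* placeholder ioctl name */
		perror("KVM_MT_OP_INIT");

	/* ... run the checkpoint workload ... */

	mts.op = KVM_MT_OP_CLEANUP;
	ioctl(vm_fd, KVM_MT_SETUP, &mts);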