
[3/6] KVM: Dirty memory tracking for performant checkpointing and improved live migration

Message ID BL2PR08MB4812F929A2760BC40EA757CF0630@BL2PR08MB481.namprd08.prod.outlook.com (mailing list archive)
State New, archived

Commit Message

Cao, Lei April 26, 2016, 7:24 p.m. UTC
Implement the remaining memory tracking API.

Signed-off-by: Lei Cao <lei.cao@stratus.com>
---
 arch/x86/include/asm/kvm_host.h |   5 +
 arch/x86/kvm/mmu.c              |  93 +++++
 include/uapi/linux/kvm.h        |   4 +-
 virt/kvm/kvm_main.c             | 610 +++++++++++++++++++++++++++++-
 4 files changed, 699 insertions(+), 13 deletions(-)

Comments

Kai Huang April 28, 2016, 9:13 a.m. UTC | #1
Hi,

On 4/27/2016 7:24 AM, Cao, Lei wrote:
> Implement the remaining memory tracking API.
>
> Signed-off-by: Lei Cao <lei.cao@stratus.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   5 +
>  arch/x86/kvm/mmu.c              |  93 +++++
>  include/uapi/linux/kvm.h        |   4 +-
>  virt/kvm/kvm_main.c             | 610 +++++++++++++++++++++++++++++-
>  4 files changed, 699 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b7e3944..52bff2b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1030,6 +1030,11 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
>  				   gfn_t gfn_offset, unsigned long mask);
>  void kvm_mmu_zap_all(struct kvm *kvm);
>  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
> +void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm);
> +void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm);
> +int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset);
> +gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset);
> +
>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 1ff4dbb..a36475a 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1443,6 +1443,58 @@ restart:
>  	return 0;
>  }
>
> +static struct kvm_memory_slot *kvm_memslot_from_id(struct kvm *kvm, int slot_id)
> +{
> +	int i;
> +	struct kvm_memory_slot *memslot;
> +	struct kvm_memslots *slots;
> +
> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +		slots = __kvm_memslots(kvm, i);
> +		kvm_for_each_memslot(memslot, slots) {
> +			if (memslot->id == slot_id)
> +				return memslot;
> +		}
> +	}
> +	return NULL;
> +}
> +
> +gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset)
> +{
> +	struct kvm_memory_slot *slot;
> +	int slot_id;
> +	gfn_t offset;
> +
> +	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
> +	slot = kvm_memslot_from_id(kvm, slot_id);
> +	if (slot == NULL) {
> +		pr_warn("KVM: bad slot_id %d\n", slot_id);
> +		return kvm->mt.max_gfn+1;
> +	}
> +	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
> +	return offset + slot->base_gfn;
> +}
> +
> +int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset)
> +{
> +	struct kvm_memory_slot *slot;
> +	int slot_id;
> +	gfn_t offset, gfn;
> +
> +	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
> +	slot = kvm_memslot_from_id(kvm, slot_id);
> +	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
> +	gfn = offset + slot->base_gfn;
> +
> +	if (gfn > kvm->mt.max_gfn) {
> +		pr_warn("KVM: bad gfn %lx\n", (long)gfn);
> +		return 0;
> +	}
> +
> +	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot, offset, 1);
> +	return 1;
> +}
> +
>  struct slot_rmap_walk_iterator {
>  	/* input fields. */
>  	struct kvm_memory_slot *slot;
> @@ -4762,6 +4814,47 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>  		kvm_flush_remote_tlbs(kvm);
>  }
>
> +void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm)
> +{
> +	int i;
> +	struct kvm_memslots *slots;
> +	struct kvm_memory_slot *memslot;
> +
> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +		slots = __kvm_memslots(kvm, i);
> +
> +		kvm_for_each_memslot(memslot, slots) {
> +			if (memslot->id < KVM_USER_MEM_SLOTS) {
> +				if (kvm_x86_ops->slot_enable_log_dirty)
> +					kvm_x86_ops->slot_enable_log_dirty(kvm,
> +						memslot);
> +				else
> +					kvm_mmu_slot_remove_write_access(kvm,
> +						memslot);
> +			}
> +		}
> +	}
> +}
> +
> +void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm)
> +{
> +	int i;
> +	struct kvm_memslots *slots;
> +	struct kvm_memory_slot *memslot;
> +
> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +		slots = __kvm_memslots(kvm, i);
> +
> +		kvm_for_each_memslot(memslot, slots) {
> +			if (memslot->id < KVM_USER_MEM_SLOTS) {
> +				if (kvm_x86_ops->slot_disable_log_dirty)
> +					kvm_x86_ops->slot_disable_log_dirty(kvm,
> +						memslot);
> +			}
> +		}
> +	}
> +}
> +
>  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>  					 struct kvm_rmap_head *rmap_head)
>  {
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 2bce4db..736668d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1344,11 +1344,11 @@ struct mt_enable {
>  #define MT_OFFSET_MASK		(0x0000ffffffffffffUL)
>
>  #define MT_MAKE_SLOT_OFFSET(slot, offset)			\
> -	do {							\
> +	({							\
>  		__u64 slot_off = offset & MT_OFFSET_MASK;	\
>  		slot_off |= ((__u64)slot << 48);		\
>  		slot_off;					\
> -	} while (0)
> +	})
>
>  #define MT_OFFSET_FROM_SLOT_OFFSET(slot_off)		\
>  	(slot_off & MT_OFFSET_MASK)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index fe46067..ba99cbc6 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1795,8 +1795,12 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
>
> -static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
> -			          const void *data, int offset, int len)
> +static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
> +	gfn_t gfn, struct kvm_vcpu *vcpu);

One general comment: wouldn't it be better to divide kvm_mt and embed it
in kvm_memory_slot? In my understanding, the main difference between the
bitmap and your log-dirty mechanism is that you are using a list rather
than a bitmap, and I think embedding the dirty_gfn_list in
kvm_memory_slot would simplify a lot of your code.
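
To make the suggestion concrete, I am thinking of something roughly like
this (just a sketch; the field names are borrowed from your gfn_list and
are hypothetical):

	struct kvm_memory_slot {
		/* ... existing fields, including dirty_bitmap ... */
		u64 *dirty_gfns;	/* per-slot dirty list (offsets into the slot) */
		u32 dirty_index;	/* next free entry */
		u32 max_dirty;		/* capacity of dirty_gfns */
	};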

Thanks,
-Kai

> +
> +static int __kvm_write_guest_page(struct kvm *kvm,
> +				struct kvm_memory_slot *memslot, gfn_t gfn,
> +				const void *data, int offset, int len)
>  {
>  	int r;
>  	unsigned long addr;
> @@ -1808,6 +1812,8 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
>  	if (r)
>  		return -EFAULT;
>  	mark_page_dirty_in_slot(memslot, gfn);
> +	if (memslot && (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS))
> +		mt_mark_page_dirty(kvm, memslot, gfn, NULL);
>  	return 0;
>  }
>
> @@ -1816,7 +1822,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
>  {
>  	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>
> -	return __kvm_write_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
>  }
>  EXPORT_SYMBOL_GPL(kvm_write_guest_page);
>
> @@ -1825,7 +1831,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
>  {
>  	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>
> -	return __kvm_write_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
>
> @@ -1929,6 +1935,10 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>  	if (r)
>  		return -EFAULT;
>  	mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
> +	if (ghc->memslot && (ghc->memslot->id >= 0 &&
> +		ghc->memslot->id < KVM_USER_MEM_SLOTS))
> +		mt_mark_page_dirty(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT,
> +			NULL);
>
>  	return 0;
>  }
> @@ -1996,11 +2006,95 @@ static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
>  	}
>  }
>
> +/*
> + * We have some new dirty pages for our sublist waiter.  Enough to merit
> + * waking it up?
> + */
> +static void mt_sw_add_pages(struct kvm *kvm)
> +{
> +	int avail = kvm->mt.tot_pages - kvm->mt.fetch_count;
> +	struct sublist_waiter *swp = &kvm->mt.sw;
> +
> +	spin_lock(&kvm->mt.sw_lock);
> +
> +	if (swp->goal && (avail >= swp->goal)) {
> +		kvm->mt.fetch_count += avail;
> +		swp->goal = 0;
> +		wake_up(&swp->wq);
> +	}
> +
> +	spin_unlock(&kvm->mt.sw_lock);
> +}
> +
> +#define DIRTY_GFN_ADD_GRANULARITY      (256)
> +
> +static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
> +	gfn_t gfn, struct kvm_vcpu *vcpu)
> +{
> +	int use_kvm;            /* add to global list? */
> +	struct gfn_list *gfnlist;
> +	int slot_id = slot->id;
> +	__u64 offset = gfn - slot->base_gfn;
> +	__u64 slot_offset;
> +
> +	/*
> +	 * Try to add dirty page to vcpu list.  If vcpu is NULL or
> +	 * vcpu list is full, then try to add to kvm master list.
> +	 */
> +
> +	if (!kvm->mt.active)
> +		return;
> +
> +	if (slot->id >= KVM_USER_MEM_SLOTS)
> +		return;
> +
> +	if (gfn > kvm->mt.max_gfn)
> +		return;
> +
> +	/* if we're avoiding duplicates, is this one already marked? */
> +	if (kvm->mt.bmap && test_and_set_bit(gfn, kvm->mt.bmap))
> +		return;
> +
> +	slot_offset = MT_MAKE_SLOT_OFFSET(slot_id, offset);
> +
> +	use_kvm = (vcpu == NULL);
> +
> +	if (vcpu) {
> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +		if (gfnlist->dirty_index == gfnlist->max_dirty) {
> +			use_kvm = 1;
> +			gfnlist->overflow = 1;
> +			/* Fall back to master gfn list.*/
> +			gfnlist = &kvm->mt.gfn_list;
> +		}
> +	} else {
> +		gfnlist = &kvm->mt.gfn_list;
> +	}
> +
> +	spin_lock(&gfnlist->lock);
> +	if (gfnlist->dirty_index >= gfnlist->max_dirty) {
> +		gfnlist->overflow = 1;
> +	} else {
> +		gfnlist->dirty_gfns[gfnlist->dirty_index++] = slot_offset;
> +		if ((gfnlist->dirty_index % DIRTY_GFN_ADD_GRANULARITY) == 0) {
> +			spin_lock(&kvm->mt.lock);
> +			kvm->mt.tot_pages += DIRTY_GFN_ADD_GRANULARITY;
> +			mt_sw_add_pages(kvm);
> +			spin_unlock(&kvm->mt.lock);
> +		}
> +	}
> +	spin_unlock(&gfnlist->lock);
> +}
> +
>  void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
>  {
>  	struct kvm_memory_slot *memslot;
>
>  	memslot = gfn_to_memslot(kvm, gfn);
> +	if (memslot) {
> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
> +			mt_mark_page_dirty(kvm, memslot, gfn, NULL);
> +	}
>  	mark_page_dirty_in_slot(memslot, gfn);
>  }
>  EXPORT_SYMBOL_GPL(mark_page_dirty);
> @@ -2010,6 +2104,10 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>  	struct kvm_memory_slot *memslot;
>
>  	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> +	if (memslot) {
> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
> +			mt_mark_page_dirty(vcpu->kvm, memslot, gfn, vcpu);
> +	}
>  	mark_page_dirty_in_slot(memslot, gfn);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
> @@ -2823,8 +2921,6 @@ static u64 kvm_get_max_gfn(struct kvm *kvm)
>  	return num_gfn - 1;
>  }
>
> -#define DIRTY_GFN_ADD_GRANULARITY      (256)
> -
>  /*
>   * Return a the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
>   */
> @@ -3010,31 +3106,523 @@ static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
>  		return -EINVAL;
>  }
>
> +static int kvm_enable_mt(struct kvm *kvm)
> +{
> +	int rc = 0;
> +
> +	if (kvm->mt.active) {
> +		pr_warn("KVM: vm %d, MT already active\n",
> +			current->pid);
> +		rc = -EINVAL;
> +		goto enable_mt_done;
> +	}
> +
> +	kvm_mmu_mt_enable_log_dirty(kvm);
> +	if (kvm->mt.bmap)
> +		memset(kvm->mt.bmap, 0, kvm->mt.bmapsz);
> +
> +	kvm->mt.active = 1;
> +
> +enable_mt_done:
> +
> +	return rc;
> +}
> +
> +static int kvm_disable_mt(struct kvm *kvm)
> +{
> +	int rc = 0;
> +
> +	if (!kvm->mt.active) {
> +		pr_warn("KVM: vm %d, MT already disabled\n",
> +			current->pid);
> +		rc = -EINVAL;
> +		goto disable_mt_done;
> +	}
> +
> +	kvm_mmu_mt_disable_log_dirty(kvm);
> +	kvm->mt.active = 0;
> +
> +disable_mt_done:
> +
> +	return rc;
> +}
> +
>  static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
>  {
> -	return -EINVAL;
> +	if ((mte->flags & 0x1) == 1)
> +		return kvm_enable_mt(kvm);
> +	else if ((mte->flags & 0x1) == 0)
> +		return kvm_disable_mt(kvm);
> +	else
> +		return -EINVAL;
>  }
>
>  static int kvm_vm_ioctl_mt_prepare_cp(struct kvm *kvm,
>  				      struct mt_prepare_cp *mtpcp)
>  {
> -	return -EINVAL;
> +	int i;
> +	struct kvm_vcpu *vcpu;
> +	struct gfn_list *gfnlist;
> +
> +	if (!kvm->mt.active)
> +		return -EINVAL;
> +
> +	kvm->mt.cp_id = mtpcp->cpid;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +		spin_lock(&gfnlist->lock);
> +		gfnlist->fetch_index = 0;
> +		gfnlist->reset_index = 0;
> +		gfnlist->dirty_index = 0;
> +		gfnlist->overflow = 0;
> +		spin_unlock(&gfnlist->lock);
> +	}
> +
> +	gfnlist = &kvm->mt.gfn_list;
> +	spin_lock(&gfnlist->lock);
> +	gfnlist->fetch_index = 0;
> +	gfnlist->reset_index = 0;
> +	gfnlist->dirty_index = 0;
> +	gfnlist->overflow = 0;
> +	spin_unlock(&gfnlist->lock);
> +
> +	kvm->mt.quiesced = 0;
> +	kvm->mt.allow_blocking = 1;
> +	kvm->mt.tot_pages  = kvm->mt.fetch_count = 0;
> +
> +	return 0;
> +}
> +
> +static bool mt_reset_gfn(struct kvm *kvm, u64 slot_offset)
> +{
> +	gfn_t gfn;
> +
> +	gfn = kvm_mt_slot_offset_to_gfn(kvm, slot_offset);
> +	if (gfn > kvm->mt.max_gfn)
> +		return 0;
> +
> +	if (kvm->mt.bmap) {
> +		if (kvm->mt.quiesced) {
> +			/*
> +			 * Goal is to reset entire bmap, but don't need
> +			 * atomics if we are quiesced
> +			 */
> +			int offset32 = gfn/32;
> +			int *p = (int *)(kvm->mt.bmap) + offset32;
> +			*p = 0;
> +		} else {
> +			clear_bit(gfn, kvm->mt.bmap);
> +		}
> +	}
> +
> +	return kvm_mt_mmu_reset_gfn(kvm, slot_offset);
> +}
> +
> +#define GFN_RESET_BATCH        (64)
> +
> +static int mt_reset_all_gfns(struct kvm *kvm)
> +{
> +	int i, j;
> +	struct kvm_vcpu *vcpu;
> +	struct gfn_list *gfnlist;
> +	bool cleared = false;
> +	int reset_start, count, avail;
> +
> +	if (!kvm->mt.active)
> +		return -EINVAL;
> +
> +	if (!kvm->mt.quiesced)
> +		return -EINVAL;
> +
> +	spin_lock(&kvm->mmu_lock);
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +
> +vcpu_gfn_loop:
> +
> +		spin_lock(&gfnlist->lock);
> +		reset_start = gfnlist->reset_index;
> +		avail = gfnlist->dirty_index - gfnlist->reset_index;
> +		count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
> +		gfnlist->reset_index += count;
> +		spin_unlock(&gfnlist->lock);
> +
> +		for (j = reset_start; j < reset_start + count; j++)
> +			cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
> +
> +		if (count)
> +			goto vcpu_gfn_loop;
> +	}
> +
> +	gfnlist = &kvm->mt.gfn_list;
> +
> +global_gfn_loop:
> +
> +	spin_lock(&gfnlist->lock);
> +	reset_start = gfnlist->reset_index;
> +	avail = gfnlist->dirty_index - gfnlist->reset_index;
> +	count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
> +	gfnlist->reset_index += count;
> +	spin_unlock(&gfnlist->lock);
> +
> +	for (j = reset_start; j < reset_start + count; j++)
> +		cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
> +
> +	if (count)
> +		goto global_gfn_loop;
> +
> +	spin_unlock(&kvm->mmu_lock);
> +
> +
> +	if (cleared)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return 0;
>  }
>
>  static int kvm_vm_ioctl_mt_rearm_gfns(struct kvm *kvm)
>  {
> -	return -EINVAL;
> +	return mt_reset_all_gfns(kvm);
> +}
> +
> +static int mt_unblock_sw(struct kvm *kvm)
> +{
> +	struct sublist_waiter *swp;
> +
> +	if (!kvm->mt.active)
> +		return -EINVAL;
> +
> +	spin_lock(&kvm->mt.sw_lock);
> +
> +	kvm->mt.allow_blocking = 0;
> +
> +	/* Make sure allow_blocking is clear before the wake up */
> +	mb();
> +
> +	swp = &kvm->mt.sw;
> +	wake_up(&swp->wq);
> +
> +	spin_unlock(&kvm->mt.sw_lock);
> +
> +	return 0;
>  }
>
>  static int kvm_vm_ioctl_mt_quiesced(struct kvm *kvm)
>  {
> -	return -EINVAL;
> +	if (!kvm->mt.active)
> +		return -EINVAL;
> +
> +	kvm->mt.quiesced = 1;
> +
> +	/* wake up the sublist waiter */
> +	mt_unblock_sw(kvm);
> +
> +	if (kvm->mt.gfn_list.overflow)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static int mt_sublist_req_nowait(struct kvm *kvm,
> +				struct mt_sublist_fetch_info *msfi, int offset)
> +{
> +	int i, j, avail, goal = msfi->gfn_info.count;
> +	struct kvm_vcpu *vcpu;
> +	__u64 *gfndst, *gfnsrc;
> +	int rc = 0;
> +	__u64 slot_offset;
> +	int index;
> +
> +	/* Clearing dirty/write bits requires tlb flush before exit */
> +	int cleared = 0;
> +
> +	/* Don't need to lock gfn lists if we're in VM blackout */
> +	int need_locks = !kvm->mt.quiesced;
> +
> +	/* Consolidate flags */
> +	int reset = msfi->flags & MT_FETCH_REARM;
> +	int bmap = kvm->mt.bmap != NULL;
> +
> +	if (goal == 0)
> +		return 0;
> +
> +	gfndst = &msfi->gfn_info.gfnlist[offset];
> +	msfi->gfn_info.count = offset;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		int len, rem;
> +		int vcpu_id;
> +		struct gfn_list *gfnlist;
> +
> +		vcpu_id = vcpu->vcpu_id;
> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu_id].gfn_list;
> +
> +		mutex_lock(&gfnlist->mtx);
> +		if (need_locks)
> +			spin_lock(&gfnlist->lock);
> +
> +		avail = gfnlist->dirty_index - gfnlist->fetch_index;
> +		if (!avail) {
> +			if (need_locks)
> +				spin_unlock(&gfnlist->lock);
> +				mutex_unlock(&gfnlist->mtx);
> +			continue;
> +		}
> +		avail = avail > goal ? goal : avail;
> +		for (j = 0; j < avail; j++) {
> +			index = gfnlist->fetch_index+j;
> +			slot_offset = gfnlist->dirty_gfns[index];
> +			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
> +						slot_offset);
> +		}
> +		gfnsrc = &kvm->mt.gfn_buf[0];
> +
> +		if (need_locks)
> +			spin_unlock(&gfnlist->lock);
> +
> +		rem = copy_to_user(gfndst, gfnsrc,
> +				avail*sizeof(*gfndst)) / sizeof(*gfndst);
> +
> +		/*
> +		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
> +		 * below, but must take mmu_lock _before_ gfnlist lock.
> +		 */
> +		if (reset)
> +			spin_lock(&kvm->mmu_lock);
> +
> +		if (need_locks)
> +			spin_lock(&gfnlist->lock);
> +
> +		len = avail - rem;
> +		msfi->gfn_info.count += len;
> +		gfndst += len;
> +		if (reset) {
> +			__u64 gfn;
> +
> +			for (j = 0; j < len; j++) {
> +				index = gfnlist->fetch_index+j;
> +				slot_offset = gfnlist->dirty_gfns[index];
> +				gfn = kvm_mt_slot_offset_to_gfn(kvm,
> +					slot_offset);
> +				cleared +=
> +					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
> +				if (bmap)
> +					clear_bit(gfn, kvm->mt.bmap);
> +			}
> +			gfnlist->reset_index += len;
> +		}
> +		gfnlist->fetch_index += len;
> +
> +		if (need_locks)
> +			spin_unlock(&gfnlist->lock);
> +		if (reset)
> +			spin_unlock(&kvm->mmu_lock);
> +		mutex_unlock(&gfnlist->mtx);
> +
> +		if (len != avail) {
> +			rc = -EFAULT;
> +			goto copy_done_err;
> +		}
> +
> +		goal -= avail;
> +		if (goal == 0)
> +			break;
> +	}
> +
> +	/* If we still need more gfns, consult the master list */
> +	if (goal) {
> +		int len, rem;
> +		struct gfn_list *gfnlist = &kvm->mt.gfn_list;
> +
> +		mutex_lock(&gfnlist->mtx);
> +		if (need_locks)
> +			spin_lock(&gfnlist->lock);
> +
> +		avail = gfnlist->dirty_index - gfnlist->fetch_index;
> +		if (!avail) {
> +			if (need_locks)
> +				spin_unlock(&gfnlist->lock);
> +			mutex_unlock(&gfnlist->mtx);
> +			goto copy_done_no_err;
> +		}
> +		avail = avail > goal ? goal : avail;
> +		for (j = 0; j < avail; j++) {
> +			index = gfnlist->fetch_index+j;
> +			slot_offset = gfnlist->dirty_gfns[index];
> +			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
> +						slot_offset);
> +		}
> +		gfnsrc = &kvm->mt.gfn_buf[0];
> +
> +		if (need_locks)
> +			spin_unlock(&gfnlist->lock);
> +
> +		rem = copy_to_user(gfndst, gfnsrc,
> +				avail*sizeof(*gfndst)) / sizeof(*gfndst);
> +
> +		/*
> +		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
> +		 * below, but must take mmu_lock _before_ gfnlist lock.
> +		 */
> +		if (reset)
> +			spin_lock(&kvm->mmu_lock);
> +
> +		if (need_locks)
> +			spin_lock(&gfnlist->lock);
> +
> +		len = avail - rem;
> +		msfi->gfn_info.count += len;
> +		gfnlist->fetch_index += len;
> +		if (reset) {
> +			__u64 slot_offset;
> +			__u64 gfn;
> +
> +			for (j = 0; j < len; j++) {
> +				index = gfnlist->fetch_index+j;
> +				slot_offset = gfnlist->dirty_gfns[index];
> +				gfn = kvm_mt_slot_offset_to_gfn(kvm,
> +					slot_offset);
> +				cleared +=
> +					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
> +				if (bmap)
> +					clear_bit(gfn, kvm->mt.bmap);
> +			}
> +			gfnlist->reset_index += len;
> +		}
> +
> +		if (need_locks)
> +			spin_unlock(&gfnlist->lock);
> +		if (reset)
> +			spin_unlock(&kvm->mmu_lock);
> +		mutex_unlock(&gfnlist->mtx);
> +
> +		if (len != avail) {
> +			rc = -EFAULT;
> +			goto copy_done_err;
> +		}
> +
> +		goal -= avail;
> +	}
> +
> +copy_done_no_err:
> +
> +copy_done_err:
> +
> +	if (cleared)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return rc;
> +}
> +
> +static int mt_sublist_req_wait(struct kvm *kvm,
> +				struct mt_sublist_fetch_info *msfi)
> +{
> +	struct sublist_waiter *swp;
> +	int goal = msfi->gfn_info.count;
> +	int offset;
> +	int rc;
> +
> +	if (msfi->gfn_info.count == 0)
> +		return 0;
> +
> +	spin_lock(&kvm->mt.sw_lock);
> +	if (!kvm->mt.allow_blocking) {
> +		spin_unlock(&kvm->mt.sw_lock);
> +		return -EINVAL;
> +	}
> +	spin_unlock(&kvm->mt.sw_lock);
> +
> +	rc = mt_sublist_req_nowait(kvm, msfi, 0);
> +	if (rc || (msfi->gfn_info.count == goal))
> +		return rc;
> +
> +	offset = msfi->gfn_info.count;
> +
> +	spin_lock(&kvm->mt.sw_lock);
> +
> +	if (kvm->mt.sw_busy) {
> +		spin_unlock(&kvm->mt.sw_lock);
> +		return -EBUSY;
> +	}
> +	kvm->mt.sw_busy = 1;
> +
> +	swp = &kvm->mt.sw;
> +	swp->goal = goal;
> +
> +	spin_unlock(&kvm->mt.sw_lock);
> +
> +	rc = wait_event_interruptible(swp->wq,
> +			!kvm->mt.allow_blocking || !swp->goal);
> +
> +	spin_lock(&kvm->mt.sw_lock);
> +
> +	kvm->mt.sw_busy = 0;
> +
> +	spin_unlock(&kvm->mt.sw_lock);
> +
> +	if (rc)
> +		return rc;
> +
> +	msfi->gfn_info.count = goal - offset;
> +
> +	return mt_sublist_req_nowait(kvm, msfi, offset);
> +}
> +
> +static int mt_get_dirty_count(struct kvm *kvm,
> +				struct mt_sublist_fetch_info *msfi)
> +{
> +	int i, avail = 0;
> +	struct kvm_vcpu *vcpu;
> +	struct gfn_list *gfnlist;
> +
> +	/* Don't need to lock gfn lists if we're in VM blackout */
> +	int need_locks = !kvm->mt.quiesced;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
> +
> +		mutex_lock(&gfnlist->mtx);
> +		if (need_locks)
> +			spin_lock(&gfnlist->lock);
> +		avail += gfnlist->dirty_index - gfnlist->fetch_index;
> +		if (need_locks)
> +			spin_unlock(&gfnlist->lock);
> +		mutex_unlock(&gfnlist->mtx);
> +	}
> +
> +	gfnlist = &kvm->mt.gfn_list;
> +
> +	mutex_lock(&gfnlist->mtx);
> +	if (need_locks)
> +		spin_lock(&gfnlist->lock);
> +	avail += gfnlist->dirty_index - gfnlist->fetch_index;
> +	if (need_locks)
> +		spin_unlock(&gfnlist->lock);
> +	mutex_unlock(&gfnlist->mtx);
> +
> +	msfi->gfn_info.count = avail;
> +
> +	return 0;
>  }
>
>  static int kvm_vm_ioctl_mt_sublist_fetch(struct kvm *kvm,
>  					 struct mt_sublist_fetch_info *mtsfi)
>  {
> -	return -EINVAL;
> +	if (!kvm->mt.active)
> +		return -EINVAL;
> +
> +	if (mtsfi->gfn_info.gfnlist == NULL)
> +		return mt_get_dirty_count(kvm, mtsfi);
> +
> +	if (mtsfi->gfn_info.count == 0)
> +		return 0;
> +
> +	if (!(mtsfi->flags & MT_FETCH_WAIT))
> +		return mt_sublist_req_nowait(kvm, mtsfi, 0);
> +
> +	return mt_sublist_req_wait(kvm, mtsfi);
>  }
>
>  static int kvm_vm_ioctl_mt_dirty_trigger(struct kvm *kvm, int dirty_trigger)
>
Cao, Lei April 28, 2016, 7:58 p.m. UTC | #2
On 4/28/2016 5:13 AM, Huang, Kai wrote:
> Hi,
> 
> On 4/27/2016 7:24 AM, Cao, Lei wrote:
>> Implement the remaining memory tracking API.
>>
>> Signed-off-by: Lei Cao <lei.cao@stratus.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |   5 +
>>  arch/x86/kvm/mmu.c              |  93 +++++
>>  include/uapi/linux/kvm.h        |   4 +-
>>  virt/kvm/kvm_main.c             | 610 +++++++++++++++++++++++++++++-
>>  4 files changed, 699 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index b7e3944..52bff2b 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1030,6 +1030,11 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
>>  				   gfn_t gfn_offset, unsigned long mask);
>>  void kvm_mmu_zap_all(struct kvm *kvm);
>>  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
>> +void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm);
>> +void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm);
>> +int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset);
>> +gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset);
>> +
>>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 1ff4dbb..a36475a 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1443,6 +1443,58 @@ restart:
>>  	return 0;
>>  }
>>
>> +static struct kvm_memory_slot *kvm_memslot_from_id(struct kvm *kvm, int slot_id)
>> +{
>> +	int i;
>> +	struct kvm_memory_slot *memslot;
>> +	struct kvm_memslots *slots;
>> +
>> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>> +		slots = __kvm_memslots(kvm, i);
>> +		kvm_for_each_memslot(memslot, slots) {
>> +			if (memslot->id == slot_id)
>> +				return memslot;
>> +		}
>> +	}
>> +	return NULL;
>> +}
>> +
>> +gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset)
>> +{
>> +	struct kvm_memory_slot *slot;
>> +	int slot_id;
>> +	gfn_t offset;
>> +
>> +	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
>> +	slot = kvm_memslot_from_id(kvm, slot_id);
>> +	if (slot == NULL) {
>> +		pr_warn("KVM: bad slot_id %d\n", slot_id);
>> +		return kvm->mt.max_gfn+1;
>> +	}
>> +	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
>> +	return offset + slot->base_gfn;
>> +}
>> +
>> +int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset)
>> +{
>> +	struct kvm_memory_slot *slot;
>> +	int slot_id;
>> +	gfn_t offset, gfn;
>> +
>> +	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
>> +	slot = kvm_memslot_from_id(kvm, slot_id);
>> +	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
>> +	gfn = offset + slot->base_gfn;
>> +
>> +	if (gfn > kvm->mt.max_gfn) {
>> +		pr_warn("KVM: bad gfn %lx\n", (long)gfn);
>> +		return 0;
>> +	}
>> +
>> +	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot, offset, 1);
>> +	return 1;
>> +}
>> +
>>  struct slot_rmap_walk_iterator {
>>  	/* input fields. */
>>  	struct kvm_memory_slot *slot;
>> @@ -4762,6 +4814,47 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>>  		kvm_flush_remote_tlbs(kvm);
>>  }
>>
>> +void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm)
>> +{
>> +	int i;
>> +	struct kvm_memslots *slots;
>> +	struct kvm_memory_slot *memslot;
>> +
>> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>> +		slots = __kvm_memslots(kvm, i);
>> +
>> +		kvm_for_each_memslot(memslot, slots) {
>> +			if (memslot->id < KVM_USER_MEM_SLOTS) {
>> +				if (kvm_x86_ops->slot_enable_log_dirty)
>> +					kvm_x86_ops->slot_enable_log_dirty(kvm,
>> +						memslot);
>> +				else
>> +					kvm_mmu_slot_remove_write_access(kvm,
>> +						memslot);
>> +			}
>> +		}
>> +	}
>> +}
>> +
>> +void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm)
>> +{
>> +	int i;
>> +	struct kvm_memslots *slots;
>> +	struct kvm_memory_slot *memslot;
>> +
>> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>> +		slots = __kvm_memslots(kvm, i);
>> +
>> +		kvm_for_each_memslot(memslot, slots) {
>> +			if (memslot->id < KVM_USER_MEM_SLOTS) {
>> +				if (kvm_x86_ops->slot_disable_log_dirty)
>> +					kvm_x86_ops->slot_disable_log_dirty(kvm,
>> +						memslot);
>> +			}
>> +		}
>> +	}
>> +}
>> +
>>  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>>  					 struct kvm_rmap_head *rmap_head)
>>  {
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 2bce4db..736668d 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1344,11 +1344,11 @@ struct mt_enable {
>>  #define MT_OFFSET_MASK		(0x0000ffffffffffffUL)
>>
>>  #define MT_MAKE_SLOT_OFFSET(slot, offset)			\
>> -	do {							\
>> +	({							\
>>  		__u64 slot_off = offset & MT_OFFSET_MASK;	\
>>  		slot_off |= ((__u64)slot << 48);		\
>>  		slot_off;					\
>> -	} while (0)
>> +	})
>>
>>  #define MT_OFFSET_FROM_SLOT_OFFSET(slot_off)		\
>>  	(slot_off & MT_OFFSET_MASK)
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index fe46067..ba99cbc6 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -1795,8 +1795,12 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
>>
>> -static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
>> -			          const void *data, int offset, int len)
>> +static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
>> +	gfn_t gfn, struct kvm_vcpu *vcpu);
> 
> One general comment: won't it be better if you devide kvm_mt and make it 
> embedded to kvm_memory_slot? In my understanding the main difference 
> between bitmap and your log-dirty mechanism is you are using list not 
> bitmap, and I think make the dirty_gfn_list embedded to kvm_memory_slot 
> should simplify lots of your code.
> 
> Thanks,
> -Kai
> 

It's true that one difference of the new mechanism is the use of a list
instead of a bitmap. Another difference is that the dirty list is per
vcpu, instead of per memory slot. This is so that the list can be
updated without holding a lock.

It should be noted that what is saved on the dirty list is
(mem slot id|offset), not a gfn (see mt_mark_page_dirty()). The dirty
list is in fact named "gfnlist" throughout the code, which probably
causes confusion. I'll fix it.
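
To make the encoding explicit, the entries are packed and unpacked by
the MT_* macros above, roughly like this (the upper 16 bits carry the
slot id, the lower 48 bits the page offset within the slot):

	/* pack */
	slot_offset = ((__u64)slot_id << 48) | (offset & MT_OFFSET_MASK);

	/* unpack */
	slot_id = slot_offset >> 48;
	offset  = slot_offset & MT_OFFSET_MASK;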

Radim Krčmář April 29, 2016, 6:19 p.m. UTC | #3
2016-04-28 19:58+0000, Cao, Lei:
> On 4/28/2016 5:13 AM, Huang, Kai wrote:
>> One general comment: won't it be better if you devide kvm_mt and make it 
>> embedded to kvm_memory_slot? In my understanding the main difference 
>> between bitmap and your log-dirty mechanism is you are using list not 
>> bitmap, and I think make the dirty_gfn_list embedded to kvm_memory_slot 
>> should simplify lots of your code.
> 
> It's true that one difference of the new mechanism is the use of list 
> instead of bitmap. Another difference is that the dirty list is per
> vcpu, instead of per memory slot. This is so that the list can be updated
> without holding a lock. 

A writeup of alternatives behind your decision would be welcome.

Per-vcpu structures seem crucial with a higher number of VCPUs, and
putting per-vcpu data into kvm_memory_slots would either take much more
memory or wouldn't scale as well.

Separating from kvm_memory_slot has its share of issues too, though,
because slot id can stand for different slots during runtime.
(Most/all can be solved by forbidding uses that lead to them, so
 userspace would be to blame if memory tracking worked incorrectly.
 E.g. reconfiguring a slot without pausing vcpus and draining dirty
 pages.)

> It should be noted that what is saved on the dirty list is 
> (mem slot id|offset), not gfn.

The current compaction causes a problem.  The slot id is u32, but the
gfnlist stores only the bottom 16 bits, while the upper 16 contain the
address space.  And a 48-bit offset isn't enough for GPA sizes above
60 bits.

Compaction can reserve up to PAGE_SHIFT bits.  KVM must BUILD_BUG_ON()
if KVM_ADDRESS_SPACE_NUM and KVM_MEM_SLOTS_NUM don't fit.  (We'll be
reworking this compaction as soon as per-vcpu address spaces arrive,
because KVM_MEM_SLOTS_NUM already takes 9 bits.)
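
As a rough sketch of the kind of compile-time check I mean, assuming the
packed format keeps the slot id component in the reserved top bits:

	/* sketch only; the exact bound depends on the final packing */
	BUILD_BUG_ON(KVM_ADDRESS_SPACE_NUM * KVM_MEM_SLOTS_NUM >
		     (1UL << PAGE_SHIFT));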
Cao, Lei May 2, 2016, 3:24 p.m. UTC | #4
On 4/29/2016 2:19 PM, Radim Krčmář wrote:
> 2016-04-28 19:58+0000, Cao, Lei:
>> On 4/28/2016 5:13 AM, Huang, Kai wrote:
>>> One general comment: won't it be better if you devide kvm_mt and make it 
>>> embedded to kvm_memory_slot? In my understanding the main difference 
>>> between bitmap and your log-dirty mechanism is you are using list not 
>>> bitmap, and I think make the dirty_gfn_list embedded to kvm_memory_slot 
>>> should simplify lots of your code.
>>
>> It's true that one difference of the new mechanism is the use of list 
>> instead of bitmap. Another difference is that the dirty list is per
>> vcpu, instead of per memory slot. This is so that the list can be updated
>> without holding a lock. 
> 
> A writeup of alternatives behind your decision would be welcome.
> 
> Per-vcpu structures seems crucial with higher amount of VCPUs and
> putting per-vcpu into kvm_memory_slots would either take much more
> memory, or wouldn't scale as well.
> 
> Separating from kvm_memory_slot has its share of issues too, though,
> because slot id can stand for different slots during runtime.
> (Most/all can be solved by forbidding uses that lead to them, so
>  userspace would be to blame if memory tracking worked incorrectly.
>  E.g. reconfiguring a slot without pausing vcpus and draining dirty
>  pages.)
>

Good point. Our per-vcpu dirty list is separate from kvm_memory_slot. I'll
look into the changing slot id issue.
 
>> It should be noted that what is saved on the dirty list is 
>> (mem slot id|offset), not gfn.
> 
> Current compaction causes a problem.  Slot id is u32, but gnflist stores
> only bottom 16 bits while upper 16 contain important address space.
> And 48 bit offset isn't enough for GPA sizes above 60 bits.
> 
> Compaction can reserve up to PAGE_SHIFT bits.  KVM must BUILD_BUG_ON()
> if KVM_ADDRESS_SPACE_NUM and KVM_MEM_SLOTS_NUM don't fit.  (We'll be
> this reworking compaction as soon as per-vcpu address spaces arrive,
> because KVM_MEM_SLOTS_NUM already takes 9 bits.)
> 

The memory slot id is currently defined as a short. If I understand
correctly, you are saying that it'll become u32 when per-vcpu address
spaces arrive, right? That's certainly an issue for the current
compaction. I'll need to rework it.

Thanks!
Radim Krčmář May 2, 2016, 3:46 p.m. UTC | #5
2016-05-02 15:24+0000, Cao, Lei:
> On 4/29/2016 2:19 PM, Radim Krčmář wrote:
> > 2016-04-28 19:58+0000, Cao, Lei:
>>> It should be noted that what is saved on the dirty list is 
>>> (mem slot id|offset), not gfn.
>> 
>> Current compaction causes a problem.  Slot id is u32, but gnflist stores
>> only bottom 16 bits while upper 16 contain important address space.
>> And 48 bit offset isn't enough for GPA sizes above 60 bits.
>> 
>> Compaction can reserve up to PAGE_SHIFT bits.  KVM must BUILD_BUG_ON()
>> if KVM_ADDRESS_SPACE_NUM and KVM_MEM_SLOTS_NUM don't fit.  (We'll be
>> this reworking compaction as soon as per-vcpu address spaces arrive,
>> because KVM_MEM_SLOTS_NUM already takes 9 bits.)
>> 
> 
> memory slot id is currently defined as short. If I understand correctly,
> you are saying that it'll become u32 when per-vcpu address space arrives,
> right? It's certainly an issue for the current compaction. I'll need to
> rework it.

slot_id was always u32.  Before address spaces, it had the interval
[0, KVM_MEM_SLOTS_NUM), so u16 was enough.  After address spaces,
slot_id has two components:
 as_id:      [0, KVM_ADDRESS_SPACE_NUM)
 as_slot_id: [0, KVM_MEM_SLOTS_NUM)

and slot_id is
  as_id << 16 | as_slot_id

This means you shouldn't look only at the bottom 16 bits of slot_id.
'id' in struct kvm_memory_slot is only a short, because it doesn't need
to store the address space, but you are interested in a userspace
interface, which uses the u32 slot_id to identify a slot.
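
In code terms, the split of the userspace-facing u32 is simply:

	as_id      = slot_id >> 16;	/* address space */
	as_slot_id = slot_id & 0xffff;	/* slot within that address space */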
Cao, Lei May 2, 2016, 3:51 p.m. UTC | #6
On 5/2/2016 11:46 AM, Radim Krčmář wrote:
> 2016-05-02 15:24+0000, Cao, Lei:
>> On 4/29/2016 2:19 PM, Radim Krčmář wrote:
>>> 2016-04-28 19:58+0000, Cao, Lei:
>>>> It should be noted that what is saved on the dirty list is 
>>>> (mem slot id|offset), not gfn.
>>>
>>> Current compaction causes a problem.  Slot id is u32, but gnflist stores
>>> only bottom 16 bits while upper 16 contain important address space.
>>> And 48 bit offset isn't enough for GPA sizes above 60 bits.
>>>
>>> Compaction can reserve up to PAGE_SHIFT bits.  KVM must BUILD_BUG_ON()
>>> if KVM_ADDRESS_SPACE_NUM and KVM_MEM_SLOTS_NUM don't fit.  (We'll be
>>> this reworking compaction as soon as per-vcpu address spaces arrive,
>>> because KVM_MEM_SLOTS_NUM already takes 9 bits.)
>>>
>>
>> memory slot id is currently defined as short. If I understand correctly,
>> you are saying that it'll become u32 when per-vcpu address space arrives,
>> right? It's certainly an issue for the current compaction. I'll need to
>> rework it.
> 
> slot_id was always u32.  Before address spaces, it had interval [0,
> KVM_MEM_SLOTS_NUM), so u16 was enough.  After address spaces, slot_id
> has two components
>  as_id:      [0, KVM_ADDRESS_SPACE_NUM)
>  as_slot_id: [0, KVM_MEM_SLOTS_NUM)
> 
> and slot_id is
>   as_id << 16 | as_slot_id
> 
> This means you shouldn't look only at bottom 16 bits of slot_id.
> 'id' in struct kvm_memory_slot is only short, because it doesn't need to
> store the address space, but you are interested in a userspace
> interface, which uses u32 slot_id to identify a slot.
> 

Got it. Thanks!
Kai Huang May 3, 2016, 6:06 a.m. UTC | #7
On 5/3/2016 3:24 AM, Cao, Lei wrote:
> On 4/29/2016 2:19 PM, Radim Krčmář wrote:
>> 2016-04-28 19:58+0000, Cao, Lei:
>>> On 4/28/2016 5:13 AM, Huang, Kai wrote:
>>>> One general comment: won't it be better if you divide kvm_mt and embed it
>>>> in kvm_memory_slot? In my understanding the main difference
>>>> between the bitmap and your log-dirty mechanism is that you are using a list,
>>>> not a bitmap, and I think making the dirty_gfn_list embedded in kvm_memory_slot
>>>> should simplify lots of your code.
>>>
>>> It's true that one difference of the new mechanism is the use of list
>>> instead of bitmap. Another difference is that the dirty list is per
>>> vcpu, instead of per memory slot. This is so that the list can be updated
>>> without holding a lock.
>>
>> A writeup of alternatives behind your decision would be welcome.
>>
>> Per-vcpu structures seem crucial with a higher number of VCPUs, and
>> putting per-vcpu data into kvm_memory_slots would either take much more
>> memory or wouldn't scale as well.
>>
>> Separating from kvm_memory_slot has its share of issues too, though,
>> because slot id can stand for different slots during runtime.
>> (Most/all can be solved by forbidding uses that lead to them, so
>>  userspace would be to blame if memory tracking worked incorrectly.
>>  E.g. reconfiguring a slot without pausing vcpus and draining dirty
>>  pages.)
>>
>
> Good point. Our per-vcpu dirty list is separate from kvm_memory_slot. I'll
> look into the changing slot id issue.

Actually my concern is, with your new mechanism to track guest dirty 
pages, there will be two logdirty mechanisms (the bitmap and your 
per-vcpu list), which I think is not good as it's a bit redundant, 
given both mechanisms are used for dirty page logging.

I think your main concern with the current bitmap mechanism is that 
scanning the bitmap takes lots of time: even when only a few pages get 
dirty, you still have to scan the entire bitmap, which results in bad 
performance if you run checkpoints very frequently. My suggestion is, 
instead of introducing two logdirty data structures, maybe you can try 
to use a more efficient data structure than a bitmap for both the 
current logdirty mechanism and your new interfaces. Maybe Xen's 
log-dirty tree is a good reference.

Of course this is just my concern and I'll leave it to maintainers.

Thanks,
-Kai
Radim Krčmář May 3, 2016, 2:11 p.m. UTC | #8
2016-05-03 18:06+1200, Huang, Kai:
> Actually my concern is, with your new mechanism to track guest dirty pages,
> there will be two logdirty mechanisms (using bitmap and your per-vcpu list),
> which I think is not good as it's a little bit redundant, given both
> mechanisms are used for dirty page logging.
> 
> I think your main concern of current bitmap mechanism is scanning bitmap
> takes lots of time, especially when only few pages get dirty, you still have
> to scan the entire bitmap, which results in bad performance if you runs
> checkpoint very frequently. My suggestion is, instead of introducing two
> logdirty data structures, maybe you can try to use another more efficient
> data structure instead of bitmap for both current logdirty mechanism and
> your new interfaces. Maybe Xen's log-dirty tree is a good reference.

A sparse structure (buffer, tree, ...) also needs a mechanism to grow
(store new entries), so concurrent accesses become a problem, because
there has to be synchronization.  I think that a per-vcpu structure
becomes mandatory when thousands of VCPUs dirty memory at the same time.

>                      Maybe Xen's log-dirty tree is a good reference.

Is there some top-level overview?

From a glance at the code, it looked like a GPA bitmap sparsified with a
radix tree in a manner similar to the page table hierarchy.

> Of course this is just my concern and I'll leave it to maintainers.

I too would prefer if both userspace interfaces used a common backend.
A possible backend for that is

  vcpu -> memslot -> sparse dirty log

We should have a dynamic sparse dirty log, to avoid wasting memory when
there are many small memslots, but a linear structure is probably still
fine.

We don't care which vcpu dirtied the page, so it seems like a waste to
have them in the hierarchy, but I can't think of designs where the
sparse dirty log is rooted in memslot and its updates scale well.

'memslot -> sparse dirty log' designs usually evolve into buffering on the
VCPU side before writing to the memslot, or aren't efficient for a sparse
dataset.

Where do you think is the balance between 'memslot -> bitmap' and
'vcpu -> memslot -> dirty buffer'?
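
For concreteness, the two shapes I mean look roughly like this (an
illustration only; the struct and field names are hypothetical, not from
this series):

#include <stdint.h>

/* 'memslot -> bitmap': what the existing dirty logging keeps per slot. */
struct slot_dirty_bitmap {
	unsigned long *dirty_bitmap;	/* one bit per page in the slot */
};

/* 'vcpu -> memslot -> dirty buffer': a fixed, preallocated buffer of
 * dirty offsets that each vcpu keeps for each slot it touches. */
struct vcpu_slot_dirty_buffer {
	uint32_t count;			/* entries currently used */
	uint32_t capacity;		/* fixed at allocation time */
	uint64_t offset[];		/* slot-relative dirty page offsets */
};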

Thanks.
Kai Huang May 4, 2016, 7:45 a.m. UTC | #9
On 5/4/2016 2:11 AM, Radim Krčmář wrote:
> 2016-05-03 18:06+1200, Huang, Kai:
>> Actually my concern is, with your new mechanism to track guest dirty pages,
>> there will be two logdirty mechanisms (using bitmap and your per-vcpu list),
>> which I think is not good as it's a little bit redundant, given both
>> mechanisms are used for dirty page logging.
>>
>> I think your main concern of current bitmap mechanism is scanning bitmap
>> takes lots of time, especially when only few pages get dirty, you still have
>> to scan the entire bitmap, which results in bad performance if you runs
>> checkpoint very frequently. My suggestion is, instead of introducing two
>> logdirty data structures, maybe you can try to use another more efficient
>> data structure instead of bitmap for both current logdirty mechanism and
>> your new interfaces. Maybe Xen's log-dirty tree is a good reference.
>
> A sparse structure (buffer, tree, ...) also needs a mechanism to grow
> (store new entries), so concurrent accesses become a problem, because
> there has to be synchronization.  I think that per-vcpu structure
> becomes mandatory when thousands VCPUs dirty memory at the same time.

Yes, synchronization will be needed. But even for a per-vcpu structure, we 
still need a per-vcpu lock to access, say, gfn_list, right? For example, 
one thread from userspace trying to get and clear dirty pages would need 
to loop over all vcpus and acquire each vcpu's lock for gfn_list (see 
function mt_reset_all_gfns in patch 3/6). That doesn't look scalable 
either, does it?

>
>>                      Maybe Xen's log-dirty tree is a good reference.
>
> Is there some top-level overview?
>
> From a glance at the code, it looked like GPA bitmap sparsified with
> radix tree in a manner similar to the page table hierarchy.

Yes, it is just a radix tree. The point is that the tree will be pretty small 
if there are few dirty pages, so the scanning will be very quick 
compared to a bitmap.

>
>> Of course this is just my concern and I'll leave it to maintainers.
>
> I too would prefer if both userspace interfaces used a common backend.
> A possible backend for that is
>
>   vcpu -> memslot -> sparse dirty log

This is the most reasonable proposal I think, at least for the first 
step to improve performance.

>
> We should have dynamic sparse dirty log, to avoid wasting memory when
> there are many small memslots, but a linear structure is probably still
> fine.

The sparse dirty log structure can be allocated when necessary, so I 
don't think it will waste memory. Take the radix tree as an example: if 
there are no dirty pages in the slot, the pointer to the radix tree can 
be NULL, or just the root entry.

>
> We don't care which vcpu dirtied the page, so it seems like a waste to
> have them in the hierarchy, but I can't think of designs where the
> sparse dirty log is rooted in memslot and its updates scale well.
>
> 'memslot -> sparse dirty log' usually evolve into buffering on the VCPU
> side before writing to the memslot or aren't efficient for sparse
> dataset.
>
> Where do you think is the balance between 'memslot -> bitmap' and
> 'vcpu -> memslot -> dirty buffer'?

In my opinion, we can first try 'memslot -> sparse dirty log'. Cao, Lei 
mentioned there were two bottlenecks: the bitmap and bad multithreaded 
performance due to mmu_lock. I think 'memslot -> sparse dirty log' might 
help to improve or solve the bitmap one.

>
> Thanks.
Radim Krčmář May 4, 2016, 1:13 p.m. UTC | #10
2016-05-04 19:45+1200, Huang, Kai:
> On 5/4/2016 2:11 AM, Radim Krčmář wrote:
>> 2016-05-03 18:06+1200, Huang, Kai:
>> > Actually my concern is, with your new mechanism to track guest dirty pages,
>> > there will be two logdirty mechanisms (using bitmap and your per-vcpu list),
>> > which I think is not good as it's a little bit redundant, given both
>> > mechanisms are used for dirty page logging.
>> > 
>> > I think your main concern of current bitmap mechanism is scanning bitmap
>> > takes lots of time, especially when only few pages get dirty, you still have
>> > to scan the entire bitmap, which results in bad performance if you runs
>> > checkpoint very frequently. My suggestion is, instead of introducing two
>> > logdirty data structures, maybe you can try to use another more efficient
>> > data structure instead of bitmap for both current logdirty mechanism and
>> > your new interfaces. Maybe Xen's log-dirty tree is a good reference.
>> 
>> A sparse structure (buffer, tree, ...) also needs a mechanism to grow
>> (store new entries), so concurrent accesses become a problem, because
>> there has to be synchronization.  I think that per-vcpu structure
>> becomes mandatory when thousands VCPUs dirty memory at the same time.
> 
> Yes synchronization will be needed. But even for per-vcpu structure, we
> still need per-vcpu lock to access, say, gfn_list, right? For example, one
> thread from userspace trying to get and clear dirty pages would need to loop
> all vcpus and acquire each vcpu's lock for gfn_list. (see function
> mt_reset_all_gfns in patch 3/6). Looks this is not scalable neither?

Coarse locking is optional.  The list can be designed to allow concurrent
addition and removal (a circular buffer with 3 atomic markers).

If we had 'vcpu -> memslot -> structure' then we would design the
userspace interface so it would only affect one memslot, which would
avoid any scalability issue even if there was a vcpu+memslot lock in
each structure.
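
To illustrate the idea (a rough single-writer/single-reader sketch with
made-up names, not the 3-marker scheme itself): one vcpu pushes dirty
gfns, one userspace thread harvests them, and only two atomic indices
are needed.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define DIRTY_RING_SIZE 4096			/* power of two, hypothetical */

struct dirty_ring {
	_Atomic uint32_t head;			/* next slot the vcpu writes */
	_Atomic uint32_t tail;			/* next slot userspace reads */
	uint64_t gfn[DIRTY_RING_SIZE];
};

/* Called by the vcpu that dirtied the page. */
static bool dirty_ring_push(struct dirty_ring *r, uint64_t gfn)
{
	uint32_t head = atomic_load_explicit(&r->head, memory_order_relaxed);
	uint32_t tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (head - tail == DIRTY_RING_SIZE)
		return false;			/* full: caller must wait for a harvest */
	r->gfn[head & (DIRTY_RING_SIZE - 1)] = gfn;
	atomic_store_explicit(&r->head, head + 1, memory_order_release);
	return true;
}

/* Called by the userspace harvesting thread. */
static bool dirty_ring_pop(struct dirty_ring *r, uint64_t *gfn)
{
	uint32_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
	uint32_t head = atomic_load_explicit(&r->head, memory_order_acquire);

	if (tail == head)
		return false;			/* empty */
	*gfn = r->gfn[tail & (DIRTY_RING_SIZE - 1)];
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
	return true;
}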

>> >                      Maybe Xen's log-dirty tree is a good reference.
>> 
>> Is there some top-level overview?
>> 
>> > From a glance at the code, it looked like GPA bitmap sparsified with
>> radix tree in a manner similar to the page table hierarchy.
> 
> Yes it is just a radix tree. The point is the tree will be pretty small if
> there are few dirty pages, so the scanning will be very quick, comparing to
> bitmap.

The bitmap had slow scanning, but any dynamic structure will have problems
with insertion ...

I think the tree might work if we preallocated bigger chunks to avoid
allocation overhead and made it "lockless" (fine-grained locking using
cmpxchg) to avoid a bottleneck for concurrent writes.
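
As a minimal sketch of the cmpxchg idea (illustrative only, with
hypothetical names; the kernel would use cmpxchg()/atomic bitops rather
than C11 atomics):

#include <stdatomic.h>
#include <stdint.h>

/* Set one child bit in a preallocated tree node; safe against
 * concurrent writers because the update is a compare-and-swap loop. */
static void tree_node_set_child(_Atomic uint8_t *node, uint8_t child)
{
	uint8_t old = atomic_load_explicit(node, memory_order_relaxed);
	uint8_t new;

	do {
		new = old | (uint8_t)(1u << child);
		if (new == old)
			return;		/* already set by another vcpu */
	} while (!atomic_compare_exchange_weak_explicit(node, &old, new,
							memory_order_release,
							memory_order_relaxed));
}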

>> We should have dynamic sparse dirty log, to avoid wasting memory when
>> there are many small memslots, but a linear structure is probably still
>> fine.
> 
> The sparse dirty log structure can be allocated when necessary so I don't
> think it will waste of memory. Take radix tree as example, if there's no
> dirty page in the slot, the pointer to radix can be NULL, or just root
> entry.

(And we want to waste some memory, because allocations are slow,
 tradeoffs, tradeoffs ...)

>> We don't care which vcpu dirtied the page, so it seems like a waste to
>> have them in the hierarchy, but I can't think of designs where the
>> sparse dirty log is rooted in memslot and its updates scale well.
>> 
>> 'memslot -> sparse dirty log' usually evolve into buffering on the VCPU
>> side before writing to the memslot or aren't efficient for sparse
>> dataset.
>> 
>> Where do you think is the balance between 'memslot -> bitmap' and
>> 'vcpu -> memslot -> dirty buffer'?
> 
> In my opinion, we can first try 'memslot -> sparse dirty log'. Cao, Lei
> mentioned there were two bottlenecks: bitmap and bad multithread performance
> due to mmu_lock. I think 'memslot->sparse dirty log' might help to improve
> or solve the bitmap one.

The bitmap was chosen because it scales well with concurrent writes and
is easy to export.  Lei already hit scalability issues with mmu_lock, so
I don't expect that we could afford to put all VCPUs onto one lock
elsewhere.

Good designs so far seem to be:
 memslot -> lockless radix tree
and
 vcpu -> memslot -> list  (memslot -> vcpu -> list)

I'd like to see the lockless radix tree, but I expect the per-vcpu list
to be *much* easier to implement.

Do you see other designs on the Pareto front?

Thanks.
Cao, Lei May 4, 2016, 1:51 p.m. UTC | #11
On 5/4/2016 9:13 AM, Radim Krčmář wrote:
> 2016-05-04 19:45+1200, Huang, Kai:
>> On 5/4/2016 2:11 AM, Radim Krčmář wrote:
>>> 2016-05-03 18:06+1200, Huang, Kai:
>>>> Actually my concern is, with your new mechanism to track guest dirty pages,
>>>> there will be two logdirty mechanisms (using bitmap and your per-vcpu list),
>>>> which I think is not good as it's a little bit redundant, given both
>>>> mechanisms are used for dirty page logging.
>>>>
>>>> I think your main concern of current bitmap mechanism is scanning bitmap
>>>> takes lots of time, especially when only few pages get dirty, you still have
>>>> to scan the entire bitmap, which results in bad performance if you runs
>>>> checkpoint very frequently. My suggestion is, instead of introducing two
>>>> logdirty data structures, maybe you can try to use another more efficient
>>>> data structure instead of bitmap for both current logdirty mechanism and
>>>> your new interfaces. Maybe Xen's log-dirty tree is a good reference.
>>>
>>> A sparse structure (buffer, tree, ...) also needs a mechanism to grow
>>> (store new entries), so concurrent accesses become a problem, because
>>> there has to be synchronization.  I think that per-vcpu structure
>>> becomes mandatory when thousands VCPUs dirty memory at the same time.
>>
>> Yes synchronization will be needed. But even for per-vcpu structure, we
>> still need per-vcpu lock to access, say, gfn_list, right? For example, one
>> thread from userspace trying to get and clear dirty pages would need to loop
>> all vcpus and acquire each vcpu's lock for gfn_list. (see function
>> mt_reset_all_gfns in patch 3/6). Looks this is not scalable neither?
> 
> Coarse locking is optional.  The list can be designed allow concurrent
> addition and removal (circullar buffer with 3 atomic markers).
> 

Agreed. 

Kai, the locking in mt_reset_all_gfns is a read lock on mmu_lock, so there is no
synchronization among userspace threads when clearing dirty pages. We changed it
from a spin lock to avoid the mmu_lock bottleneck.

> If we had 'vcpu -> memslot -> structure' then we would design the
> userspace interface so it would only affect one memslot, which would
> avoid any scalability issue even if there was a vcpu+memslot lock in
> each structure.
> 
>>>>                      Maybe Xen's log-dirty tree is a good reference.
>>>
>>> Is there some top-level overview?
>>>
>>>> From a glance at the code, it looked like GPA bitmap sparsified with
>>> radix tree in a manner similar to the page table hierarchy.
>>
>> Yes it is just a radix tree. The point is the tree will be pretty small if
>> there are few dirty pages, so the scanning will be very quick, comparing to
>> bitmap.
> 
> Bitmap had slow scanning, but any dynamic structure will have problems
> with insertion ...
> 
> I think the tree might work if we pre-allotected bigger chunks to avoid
> allocation overhead and made it "lockless" (fine grained locking using
> cmpxchg) to avoid a bottleneck for concurrent writes.
> 
>>> We should have dynamic sparse dirty log, to avoid wasting memory when
>>> there are many small memslots, but a linear structure is probably still
>>> fine.
>>
>> The sparse dirty log structure can be allocated when necessary so I don't
>> think it will waste of memory. Take radix tree as example, if there's no
>> dirty page in the slot, the pointer to radix can be NULL, or just root
>> entry.
> 
> (And we want to waste some memory, because allocations are slow,
>  tradeoffs, tradeoffs ...)
> 

Right, we want to avoid dynamic memory allocation/freeing during checkpoint cycles,
which are performance critical.

>>> We don't care which vcpu dirtied the page, so it seems like a waste to
>>> have them in the hierarchy, but I can't think of designs where the
>>> sparse dirty log is rooted in memslot and its updates scale well.
>>>
>>> 'memslot -> sparse dirty log' usually evolve into buffering on the VCPU
>>> side before writing to the memslot or aren't efficient for sparse
>>> dataset.
>>>
>>> Where do you think is the balance between 'memslot -> bitmap' and
>>> 'vcpu -> memslot -> dirty buffer'?
>>
>> In my opinion, we can first try 'memslot -> sparse dirty log'. Cao, Lei
>> mentioned there were two bottlenecks: bitmap and bad multithread performance
>> due to mmu_lock. I think 'memslot->sparse dirty log' might help to improve
>> or solve the bitmap one.
> 
> The bimap was chosen because it scales well with concurrent writes and
> it easy to export.  Lei already hit scalability issues with mmu_lock, so
> I don't expect that we could afford to put all VCPUs onto one lock
> elsewhere.
> 
> Good designs so far seem to be:
>  memslot -> lockless radix tree
> and
>  vcpu -> memslot -> list  (memslot -> vcpu -> list)
> 
> I'd like to see the lockless radix tree, but I expect the per-vcpu list
> to be *much* easier to implment.
> 
> Do you see other designs on the pareto front?
> 
> Thanks.
> 

Cao, Lei May 4, 2016, 5:15 p.m. UTC | #12
On 5/4/2016 9:13 AM, Radim Krčmář wrote:
> 2016-05-04 19:45+1200, Huang, Kai:
>> On 5/4/2016 2:11 AM, Radim Krčmář wrote:
>>> 2016-05-03 18:06+1200, Huang, Kai:
>>>> Actually my concern is, with your new mechanism to track guest dirty pages,
>>>> there will be two logdirty mechanisms (using bitmap and your per-vcpu list),
>>>> which I think is not good as it's a little bit redundant, given both
>>>> mechanisms are used for dirty page logging.
>>>>
>>>> I think your main concern of current bitmap mechanism is scanning bitmap
>>>> takes lots of time, especially when only few pages get dirty, you still have
>>>> to scan the entire bitmap, which results in bad performance if you runs
>>>> checkpoint very frequently. My suggestion is, instead of introducing two
>>>> logdirty data structures, maybe you can try to use another more efficient
>>>> data structure instead of bitmap for both current logdirty mechanism and
>>>> your new interfaces. Maybe Xen's log-dirty tree is a good reference.
>>>
>>> A sparse structure (buffer, tree, ...) also needs a mechanism to grow
>>> (store new entries), so concurrent accesses become a problem, because
>>> there has to be synchronization.  I think that per-vcpu structure
>>> becomes mandatory when thousands VCPUs dirty memory at the same time.
>>
>> Yes synchronization will be needed. But even for per-vcpu structure, we
>> still need per-vcpu lock to access, say, gfn_list, right? For example, one
>> thread from userspace trying to get and clear dirty pages would need to loop
>> all vcpus and acquire each vcpu's lock for gfn_list. (see function
>> mt_reset_all_gfns in patch 3/6). Looks this is not scalable neither?
> 
> Coarse locking is optional.  The list can be designed allow concurrent
> addition and removal (circullar buffer with 3 atomic markers).
> 
> If we had 'vcpu -> memslot -> structure' then we would design the
> userspace interface so it would only affect one memslot, which would
> avoid any scalability issue even if there was a vcpu+memslot lock in
> each structure.
> 
>>>>                      Maybe Xen's log-dirty tree is a good reference.
>>>
>>> Is there some top-level overview?
>>>
>>>> From a glance at the code, it looked like GPA bitmap sparsified with
>>> radix tree in a manner similar to the page table hierarchy.
>>
>> Yes it is just a radix tree. The point is the tree will be pretty small if
>> there are few dirty pages, so the scanning will be very quick, comparing to
>> bitmap.
> 
> Bitmap had slow scanning, but any dynamic structure will have problems
> with insertion ...
> 
> I think the tree might work if we pre-allotected bigger chunks to avoid
> allocation overhead and made it "lockless" (fine grained locking using
> cmpxchg) to avoid a bottleneck for concurrent writes.
> 
>>> We should have dynamic sparse dirty log, to avoid wasting memory when
>>> there are many small memslots, but a linear structure is probably still
>>> fine.
>>
>> The sparse dirty log structure can be allocated when necessary so I don't
>> think it will waste of memory. Take radix tree as example, if there's no
>> dirty page in the slot, the pointer to radix can be NULL, or just root
>> entry.
> 
> (And we want to waste some memory, because allocations are slow,
>  tradeoffs, tradeoffs ...)
> 
>>> We don't care which vcpu dirtied the page, so it seems like a waste to
>>> have them in the hierarchy, but I can't think of designs where the
>>> sparse dirty log is rooted in memslot and its updates scale well.
>>>
>>> 'memslot -> sparse dirty log' usually evolve into buffering on the VCPU
>>> side before writing to the memslot or aren't efficient for sparse
>>> dataset.
>>>
>>> Where do you think is the balance between 'memslot -> bitmap' and
>>> 'vcpu -> memslot -> dirty buffer'?
>>
>> In my opinion, we can first try 'memslot -> sparse dirty log'. Cao, Lei
>> mentioned there were two bottlenecks: bitmap and bad multithread performance
>> due to mmu_lock. I think 'memslot->sparse dirty log' might help to improve
>> or solve the bitmap one.
> 
> The bimap was chosen because it scales well with concurrent writes and
> it easy to export.  Lei already hit scalability issues with mmu_lock, so
> I don't expect that we could afford to put all VCPUs onto one lock
> elsewhere.
> 
> Good designs so far seem to be:
>  memslot -> lockless radix tree
> and
>  vcpu -> memslot -> list  (memslot -> vcpu -> list)
>

There is no need for lookup; the dirty log is fetched in sequence, so why use
a radix tree with added complexity but no benefit?

A list can be designed to be lockless, so memslot -> lockless fixed list?
 
> I'd like to see the lockless radix tree, but I expect the per-vcpu list
> to be *much* easier to implment.
> 
> Do you see other designs on the pareto front?
> 
> Thanks.
> 

Cao, Lei May 4, 2016, 6:33 p.m. UTC | #13
On 5/4/2016 1:15 PM, Cao, Lei wrote:
> On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>> 2016-05-04 19:45+1200, Huang, Kai:
>>> On 5/4/2016 2:11 AM, Radim Krčmář wrote:
>>>> 2016-05-03 18:06+1200, Huang, Kai:
>>>>> Actually my concern is, with your new mechanism to track guest dirty pages,
>>>>> there will be two logdirty mechanisms (using bitmap and your per-vcpu list),
>>>>> which I think is not good as it's a little bit redundant, given both
>>>>> mechanisms are used for dirty page logging.
>>>>>
>>>>> I think your main concern of current bitmap mechanism is scanning bitmap
>>>>> takes lots of time, especially when only few pages get dirty, you still have
>>>>> to scan the entire bitmap, which results in bad performance if you runs
>>>>> checkpoint very frequently. My suggestion is, instead of introducing two
>>>>> logdirty data structures, maybe you can try to use another more efficient
>>>>> data structure instead of bitmap for both current logdirty mechanism and
>>>>> your new interfaces. Maybe Xen's log-dirty tree is a good reference.
>>>>
>>>> A sparse structure (buffer, tree, ...) also needs a mechanism to grow
>>>> (store new entries), so concurrent accesses become a problem, because
>>>> there has to be synchronization.  I think that per-vcpu structure
>>>> becomes mandatory when thousands VCPUs dirty memory at the same time.
>>>
>>> Yes synchronization will be needed. But even for per-vcpu structure, we
>>> still need per-vcpu lock to access, say, gfn_list, right? For example, one
>>> thread from userspace trying to get and clear dirty pages would need to loop
>>> all vcpus and acquire each vcpu's lock for gfn_list. (see function
>>> mt_reset_all_gfns in patch 3/6). Looks this is not scalable neither?
>>
>> Coarse locking is optional.  The list can be designed allow concurrent
>> addition and removal (circullar buffer with 3 atomic markers).
>>
>> If we had 'vcpu -> memslot -> structure' then we would design the
>> userspace interface so it would only affect one memslot, which would
>> avoid any scalability issue even if there was a vcpu+memslot lock in
>> each structure.
>>
>>>>>                      Maybe Xen's log-dirty tree is a good reference.
>>>>
>>>> Is there some top-level overview?
>>>>
>>>>> From a glance at the code, it looked like GPA bitmap sparsified with
>>>> radix tree in a manner similar to the page table hierarchy.
>>>
>>> Yes it is just a radix tree. The point is the tree will be pretty small if
>>> there are few dirty pages, so the scanning will be very quick, comparing to
>>> bitmap.
>>
>> Bitmap had slow scanning, but any dynamic structure will have problems
>> with insertion ...
>>
>> I think the tree might work if we pre-allotected bigger chunks to avoid
>> allocation overhead and made it "lockless" (fine grained locking using
>> cmpxchg) to avoid a bottleneck for concurrent writes.
>>
>>>> We should have dynamic sparse dirty log, to avoid wasting memory when
>>>> there are many small memslots, but a linear structure is probably still
>>>> fine.
>>>
>>> The sparse dirty log structure can be allocated when necessary so I don't
>>> think it will waste of memory. Take radix tree as example, if there's no
>>> dirty page in the slot, the pointer to radix can be NULL, or just root
>>> entry.
>>
>> (And we want to waste some memory, because allocations are slow,
>>  tradeoffs, tradeoffs ...)
>>
>>>> We don't care which vcpu dirtied the page, so it seems like a waste to
>>>> have them in the hierarchy, but I can't think of designs where the
>>>> sparse dirty log is rooted in memslot and its updates scale well.
>>>>
>>>> 'memslot -> sparse dirty log' usually evolve into buffering on the VCPU
>>>> side before writing to the memslot or aren't efficient for sparse
>>>> dataset.
>>>>
>>>> Where do you think is the balance between 'memslot -> bitmap' and
>>>> 'vcpu -> memslot -> dirty buffer'?
>>>
>>> In my opinion, we can first try 'memslot -> sparse dirty log'. Cao, Lei
>>> mentioned there were two bottlenecks: bitmap and bad multithread performance
>>> due to mmu_lock. I think 'memslot->sparse dirty log' might help to improve
>>> or solve the bitmap one.
>>
>> The bimap was chosen because it scales well with concurrent writes and
>> it easy to export.  Lei already hit scalability issues with mmu_lock, so
>> I don't expect that we could afford to put all VCPUs onto one lock
>> elsewhere.
>>
>> Good designs so far seem to be:
>>  memslot -> lockless radix tree
>> and
>>  vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>
> 
> There is no need for lookup, the dirty log is fetched in sequence, so why use
> radix tree with added complexity but no benefit?
> 
> List can be designed to be lockless, so memslot -> lockless fixed list?

Never mind, lookup is needed to avoid duplicates. We can use list+bitmap, but
it's obviously not as efficient as a radix tree.

>  
>> I'd like to see the lockless radix tree, but I expect the per-vcpu list
>> to be *much* easier to implment.
>>
>> Do you see other designs on the pareto front?
>>
>> Thanks.
>>
> 
> 

Radim Krčmář May 4, 2016, 6:57 p.m. UTC | #14
2016-05-04 18:33+0000, Cao, Lei:
> On 5/4/2016 1:15 PM, Cao, Lei wrote:
>> On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>>> Good designs so far seem to be:
>>>  memslot -> lockless radix tree
>>> and
>>>  vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>>
>> 
>> There is no need for lookup, the dirty log is fetched in sequence, so why use
>> radix tree with added complexity but no benefit?
>> 
>> List can be designed to be lockless, so memslot -> lockless fixed list?
> 
> Never mind, lookup is needed to avoid duplicates. We can use list+bitmap, but
> it's obviously not as efficient as radix tree.

Are duplicates a significant problem?

(The dirtied page is marked as dirty, so we should have zero to very few
 duplicates, depending on how dirtying and the vm-exit on a write to a clean
 page cooperate.  Duplicates don't introduce any bugs, and we could also check
 the last few entries in the list to weed out the most likely cases.)
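
For example, a cheap best-effort filter along these lines (illustrative
only, hypothetical names) would catch the most common back-to-back
duplicates without any lookup structure:

#include <stdbool.h>
#include <stdint.h>

#define DUP_CHECK_WINDOW 4

/* Check only the last few entries already appended to a per-vcpu dirty
 * list; misses are fine because duplicates are benign. */
static bool recently_logged(const uint64_t *list, uint32_t count, uint64_t gfn)
{
	uint32_t i = count > DUP_CHECK_WINDOW ? count - DUP_CHECK_WINDOW : 0;

	for (; i < count; i++)
		if (list[i] == gfn)
			return true;
	return false;
}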
Radim Krčmář May 4, 2016, 7:27 p.m. UTC | #15
2016-05-04 17:15+0000, Cao, Lei:
> On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>> Good designs so far seem to be:
>>  memslot -> lockless radix tree
>> and
>>  vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>
> 
> There is no need for lookup, the dirty log is fetched in sequence, so why use
> radix tree with added complexity but no benefit?
> 
> List can be designed to be lockless, so memslot -> lockless fixed list?

It can, but a lockless list for concurrent writers is harder than a lockless
list for a concurrent writer and reader.
The difference is in starvation -- it's possible that a VCPU would never
get to write an entry unless you implemented a queueing mechanism.
A queueing mechanism means that you basically have a spinlock, so I
wouldn't bother with a lockless list and would just try a spinlock directly.

A spinlock with a very short critical section might actually work well for
< 256 VCPUs and is definitely the easiest option.  Worth experimenting
with, IMO.

A lockless radix tree doesn't starve.  Every entry has a well-defined
place in the tree.  The entry just might not be fully allocated yet.
If another VCPU is faster and expands the tree, then other VCPUs use
that extended tree until they all get to their leaf nodes; VCPUs
basically cooperate on growing the tree.

And I completely forgot that we can preallocate the whole tree and use an
effective packed storage thanks to that.  My first guess is that it
would make sense with double the memory of our bitmap.  Scans and
insertion would be slower than for a per-vcpu list, but much faster than
with a dynamically allocated structure.  I'll think a bit about that.

The main reason why I'd like something that can contain all dirty pages
is overflow -- the userspace has to treat *all* pages as dirty if we
lose a dirty page, so overflow must never happen -- we have to either
grow the dirty log or suspend the writer until userspace frees space ...
Radim Krčmář May 5, 2016, 4:26 p.m. UTC | #16
2016-05-04 21:27+0200, Radim Krčmář:
> And I completely forgot that we can preallocate the whole tree and use a
> effective packed storage thanks to that.  My first guess is that it
> would be make sense with double the memory of our bitmap.  Scans and
> insertion would be slower than for a per-vcpu list, but much faster than
> with a dynamically allocated structure.  I'll think a bit about that.

The preallocated radix tree has
 * fast insertion (few atomic bit writes at places that are computed
   only with arithmetics = no pointer traversal)
 * feasible lockless removal (not sure how fast it will be, but it only
   slows down the read side, and if we can guarantee that all vcpus are
   stopped, then it's trivial)
 * compatible with bitmap (a subset of the tree is 1:1 legacy bitmap)
 * very fast search if pages are allocated in the same range (on par
   with list)
 * only ~2 times slower search than the bitmap when the bitmap is full
 * high memory consumption (roughly double the bitmap)

The only reason why a dynamically allocated tree would be used instead
is unacceptable memory consumption, but we already lived with a bitmap,
so I think it is worth a try.

I wrote a dirty, buggy and unoptimized userspace implementation.  (Code
attached if you're interested.) Especially the bitmap is crude, so real
numbers will look different, but the asymptotic behaviour should remain.

If we consider a slot with 10000000 pages, which is 38 GB of tracked
memory, and datasets from 100 to 10000000 random dirty frames, then the
timings for the code that compacts a [l]ist, [t]ree and [b]itmap into a
dirty list are:

  % ./radix_tree 10000000 100
  kB size l:0 t:1513 b:1220
  l:        642 ns (1.00,  78.05, 45977.38)
  t:      50108 ns (0.01,   1.00,   589.08)
  b:   29517475 ns (0.00,   0.00,     1.00)

  % ./radix_tree 10000000 1000
  l:       9542 ns (1.00,  43.29,  3347.11)
  t:     413110 ns (0.02,   1.00,    77.31)
  b:   31938143 ns (0.00,   0.01,     1.00)

  % ./radix_tree 10000000 10000
  l:     105859 ns (1.00,  21.07,   273.75)
  t:    2230429 ns (0.05,   1.00,    12.99)
  b:   28978510 ns (0.00,   0.08,     1.00)

  % ./radix_tree 10000000 100000
  l:     984729 ns (1.00,  12.16,    25.53)
  t:   11970068 ns (0.08,   1.00,     2.10)
  b:   25144204 ns (0.04,   0.48,     1.00)

  % ./radix_tree 10000000 1000000
  kB size l:7812 t:1513 b:1220
  l:    6420041 ns (1.00,   5.69,     4.13)
  t:   36511925 ns (0.18,   1.00,     0.73)
  b:   26523022 ns (0.24,   1.38,     1.00)

  % ./radix_tree 10000000 10000000
  kB size l:78125 t:1513 b:1220
  l:   38180627 ns (1.00,   1.56,     1.31)
  t:   59681915 ns (0.64,   1.00,     0.84)
  b:   50079550 ns (0.76,   1.19,     1.00)

The tree is 589.08 times faster than the bitmap with 100 "dirty pages" and
the list is 78.05 times faster than the tree.  Speedups get smaller as the
dataset size increases, and the tree ends up performing worse than a bitmap.

The tree performs much better when the dataset isn't random, i.e. dirty
frames are a sequence from 0 to the size of the dataset:

  % ./radix_tree 10000000 1000 seq
  l:       8928 ns (1.00,   1.93,  3366.93)
  t:      17198 ns (0.52,   1.00,  1747.88)
  b:   30059972 ns (0.00,   0.00,     1.00)

  % ./radix_tree 10000000 10000 seq
  l:      96469 ns (1.00,   1.36,   271.40)
  t:     131124 ns (0.74,   1.00,   199.67)
  b:   26181839 ns (0.00,   0.01,     1.00)

  % ./radix_tree 10000000 100000 seq
  l:    1256119 ns (1.00,   1.28,    24.71)
  t:    1609232 ns (0.78,   1.00,    19.29)
  b:   31036003 ns (0.04,   0.05,     1.00)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <inttypes.h>
#include <time.h>

typedef uint8_t   u8;
typedef uint64_t  u64;
typedef int16_t   s16;

const u8 radix_shift = 3;
const u8 radix = 1 << 3;

struct packed_radix_8_tree {
	u64 bitmap_offset;
	u64 bitmap_size;
	u8 packed[];
};

static inline void set_bit(u8 *data, u8 bit)
{
	*data |= 1 << bit; /* not atomic */
}

static inline u8 find_next_bit(u8 *data, u8 size, u8 bit)
{
	while ((bit < size) && !(*data & (1 << bit)))
		bit++;
	return bit;
}

static inline u8 find_first_bit(u8 *data, u8 size)
{
	return find_next_bit(data, size, 0);
}

static inline u64 child_node(u8 radix_shift, u64 node, u8 child)
{
	return (node * radix) + 1 + child;
}

/* cn = child(radix, node, ci);
 * n = cn;
 * i = parent_node(radix, &n);
 * i == ci && n == node;
 */
static inline u8 parent_node(u8 radix, u64 *node)
{
	u64 child_mask = radix - 1;
	u8 child = (*node - 1) & child_mask;

	*node = (*node - 1) / radix;

	return child;
}

u64 packed_radix_8_tree_encode_leaf(struct packed_radix_8_tree *tree, u64 value)
{
	return value + tree->bitmap_offset + 1;
}

u64 packed_radix_8_tree_decode_leaf(struct packed_radix_8_tree *tree, u64 leaf)
{
	return leaf - tree->bitmap_offset - 1;
}

void packed_radix_8_tree_insert(struct packed_radix_8_tree *tree, u64 value)
{
	u64 node = packed_radix_8_tree_encode_leaf(tree, value);

	do {
		u8 child = parent_node(radix, &node);
		set_bit(&tree->packed[node], child);
	} while (node);
}

u64 __packed_radix_8_tree_find_first_leaf(struct packed_radix_8_tree *tree, u64 node)
{
	u8 child;

	while (node < tree->bitmap_offset) {
		child = find_first_bit(&tree->packed[node], radix);

		if (child == radix)
			return -1ULL;

		node = child_node(radix, node, child);
	}

	return node;
}

u64 packed_radix_8_tree_find_first_leaf(struct packed_radix_8_tree *tree)
{
	return __packed_radix_8_tree_find_first_leaf(tree, 0);
}

u64 packed_radix_8_tree_find_next_leaf(struct packed_radix_8_tree *tree, u64 leaf)
{
	u64 node = leaf;
	u8 child;

	do {
		child = parent_node(radix, &node);
		child = find_next_bit(&tree->packed[node], radix, child + 1);
	} while (node && child == radix);

	if (child == radix)
		return -1ULL;

	return __packed_radix_8_tree_find_first_leaf(tree, child_node(radix, node, child));
}

struct packed_radix_8_tree * packed_radix_8_tree_alloc(u64 size)
{
	struct packed_radix_8_tree *tree;
	u64 bitmap_size = radix;
	u64 tree_size = 0;

	while (bitmap_size < size) {
		tree_size += bitmap_size;
		bitmap_size *= radix;
	}

	tree = malloc(sizeof(*tree) + (size + tree_size) / 8);
	memset(tree->packed, 0, (size + tree_size) / 8);
	tree->bitmap_offset = tree_size;
	tree->bitmap_size = size;

	return tree;
}

#define for_each_packed_radix_8_tree_leaf(tree, leaf) \
	for (leaf = packed_radix_8_tree_find_first_leaf(tree); \
	     leaf != -1ULL; \
	     leaf = packed_radix_8_tree_find_next_leaf(tree, leaf))

u64 bitmap_find_next_bit(u8 *bitmap, u64 size, u64 bit)
{
	u8 offset;
	while (bit < size) {
		for (offset = bit % 8; offset < 8 && !(bitmap[bit/8] & (1 << offset)); bit++, offset++);
		if (offset < 8)
			break;
	}
	return bit;
}

#define for_each_packed_radix_8_tree_bit(tree, bit) \
	for (bit = bitmap_find_next_bit(&tree->packed[tree->bitmap_offset / radix], tree->bitmap_size, 0); \
	     bit < tree->bitmap_size; \
	     bit = bitmap_find_next_bit(&tree->packed[tree->bitmap_offset / radix], tree->bitmap_size, bit + 1))

u64 time_tree(struct packed_radix_8_tree *tree, u64 *list)
{
	struct timespec begin, end;
	u64 leaf, count = 0;

	clock_gettime(CLOCK_MONOTONIC, &begin);
	for_each_packed_radix_8_tree_leaf(tree, leaf)
		list[count++] = packed_radix_8_tree_decode_leaf(tree, leaf);
	clock_gettime(CLOCK_MONOTONIC, &end);

	return (end.tv_sec - begin.tv_sec)*1000000000 + end.tv_nsec - begin.tv_nsec;
}

u64 time_bitmap(struct packed_radix_8_tree *tree, u64 *list)
{
	struct timespec begin, end;
	u64 bit, count = 0;

	clock_gettime(CLOCK_MONOTONIC, &begin);
	for_each_packed_radix_8_tree_bit(tree, bit)
		list[count++] = bit;
	clock_gettime(CLOCK_MONOTONIC, &end);

	return (end.tv_sec - begin.tv_sec)*1000000000 + end.tv_nsec - begin.tv_nsec;
}

u64 time_list(u64 *list, u64 size, u64 *dest)
{
	struct timespec begin, end;
	u64 i, list_result = 0;

	clock_gettime(CLOCK_MONOTONIC, &begin);
	for (i = 0; i < size; i++)
		dest[i] = list[i];
	clock_gettime(CLOCK_MONOTONIC, &end);

	return (end.tv_sec - begin.tv_sec)*1000000000 + end.tv_nsec - begin.tv_nsec;
}

int main(int argc, char **argv) {
	u64 size = strtoul(argv[1], NULL, 10);
	u64 i, count = strtoul(argv[2], NULL, 10);
	u64 *list = malloc(sizeof(*list) * count);
	u64 *dest = malloc(sizeof(*dest) * count);
	struct packed_radix_8_tree *tree = packed_radix_8_tree_alloc(size);
	u64 tl, tt, tb;

	printf("kB size l:%"PRIu64" t:%"PRIu64" b:%"PRIu64"\n",
	       sizeof(*list) * count / 1024,
	       (tree->bitmap_size + tree->bitmap_offset) / 8 / 1024,
	       size / 8 / 1024);

	srand(size + count);

	for (i = 0; i < count; i++) {
		u64 val = argc > 3 ? i : rand() % size;
		packed_radix_8_tree_insert(tree, val);
		list[i] = val;
	}

	tl = time_list(list, count, dest);
	tt = time_tree(tree, dest);
	tb = time_bitmap(tree, dest);

	printf("l: %10"PRIu64" ns (%4.2f, %6.2f, %8.2f)\n", tl, 1.0, (float)tt/tl, (float)tb/tl);
	printf("t: %10"PRIu64" ns (%4.2f, %6.2f, %8.2f)\n", tt, (float)tl/tt, 1.0, (float)tb/tt);
	printf("b: %10"PRIu64" ns (%4.2f, %6.2f, %8.2f)\n", tb, (float)tl/tb, (float)tt/tb, 1.0);
}
Kai Huang May 6, 2016, 9:46 a.m. UTC | #17
On Thu, May 5, 2016 at 6:57 AM, Radim Krčmář <rkrcmar@redhat.com> 
wrote:
> 2016-05-04 18:33+0000, Cao, Lei:
>>  On 5/4/2016 1:15 PM, Cao, Lei wrote:
>>>  On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>>>>  Good designs so far seem to be:
>>>>   memslot -> lockless radix tree
>>>>  and
>>>>   vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>>> 
>>> 
>>>  There is no need for lookup, the dirty log is fetched in sequence, 
>>> so why use
>>>  radix tree with added complexity but no benefit?
>>> 
>>>  List can be designed to be lockless, so memslot -> lockless fixed 
>>> list?
>> 
>>  Never mind, lookup is needed to avoid duplicates. We can use 
>> list+bitmap, but
>>  it's obviously not as efficient as radix tree.
> 
> Are duplicates a significant problem?
> 
> (The dirtied page is marked as dirty, so we should have zero to very 
> few
>  duplicates, depending on how dirtying and vm-exit on write to clean 
> page
>  cooperate.  Duplicates don't introduce any bugs and we could also 
> check
>  last few entries in the list to weed out most likely cases.)

I don't think duplicated pages are a significant problem. The point is we 
don't lose pages.

I actually don't quite follow why there will be duplicated pages. Is it 
because of the lockless thing?

Thanks,
-Kai

Radim Krčmář May 6, 2016, 12:09 p.m. UTC | #18
2016-05-06 21:46+1200, Kai Huang:
> On Thu, May 5, 2016 at 6:57 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>> 2016-05-04 18:33+0000, Cao, Lei:
>> >  On 5/4/2016 1:15 PM, Cao, Lei wrote:
>> > >  On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>> > > >  Good designs so far seem to be:
>> > > >   memslot -> lockless radix tree
>> > > >  and
>> > > >   vcpu -> memslot -> list  (memslot -> vcpu -> list)
>> > > > 
>> > > 
>> > >  There is no need for lookup, the dirty log is fetched in
>> > > sequence, so why use
>> > >  radix tree with added complexity but no benefit?
>> > > 
>> > >  List can be designed to be lockless, so memslot -> lockless
>> > > fixed list?
>> > 
>> >  Never mind, lookup is needed to avoid duplicates. We can use
>> > list+bitmap, but
>> >  it's obviously not as efficient as radix tree.
>> 
>> Are duplicates a significant problem?
>> 
>> (The dirtied page is marked as dirty, so we should have zero to very few
>>  duplicates, depending on how dirtying and vm-exit on write to clean
>> page
>>  cooperate.  Duplicates don't introduce any bugs and we could also check
>>  last few entries in the list to weed out most likely cases.)
> 
> I don't think duplicated pages are significant problem. The point is we
> don't lose pages.
> 
> I actually don't quite follow why there will be duplicated pages. Is it
> because lockless thing?

Not really.  The ugly bitmap to avoid duplicates in this series makes me
think that hardware does force an exit on multiple VCPUs if they
concurrently write to the same clear page.  A list, or any per-vcpu
structure, will have multiple dirty entries if that happens.
Cao, Lei May 6, 2016, 3:13 p.m. UTC | #19
On 5/6/2016 8:09 AM, Radim Krčmář wrote:
> 2016-05-06 21:46+1200, Kai Huang:
>> On Thu, May 5, 2016 at 6:57 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>>> 2016-05-04 18:33+0000, Cao, Lei:
>>>>  On 5/4/2016 1:15 PM, Cao, Lei wrote:
>>>>>  On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>>>>>>  Good designs so far seem to be:
>>>>>>   memslot -> lockless radix tree
>>>>>>  and
>>>>>>   vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>>>>>
>>>>>
>>>>>  There is no need for lookup, the dirty log is fetched in
>>>>> sequence, so why use
>>>>>  radix tree with added complexity but no benefit?
>>>>>
>>>>>  List can be designed to be lockless, so memslot -> lockless
>>>>> fixed list?
>>>>
>>>>  Never mind, lookup is needed to avoid duplicates. We can use
>>>> list+bitmap, but
>>>>  it's obviously not as efficient as radix tree.
>>>
>>> Are duplicates a significant problem?
>>>
>>> (The dirtied page is marked as dirty, so we should have zero to very few
>>>  duplicates, depending on how dirtying and vm-exit on write to clean
>>> page
>>>  cooperate.  Duplicates don't introduce any bugs and we could also check
>>>  last few entries in the list to weed out most likely cases.)
>>
>> I don't think duplicated pages are significant problem. The point is we
>> don't lose pages.
>>
>> I actually don't quite follow why there will be duplicated pages. Is it
>> because lockless thing?
> 
> Not really.  The ugly bitmap to avoid duplicates in this series makes me
> think that hardware does force an exit on multiple VCPUs if they
> concurrently write to the same clear page.  A list, or any per-vcpu
> structure, will have multiple dirty entries if that happens.
> 

We do see tons of duplicates. The majority are from KVM updating the guest
time, steal time, and setting/clearing PV EOI.

Duplicates don't introduce bugs, but are undesirable, given a fixed-size
dirty log.
Cao, Lei May 6, 2016, 3:19 p.m. UTC | #20
On 5/5/2016 12:26 PM, Radim Krčmář wrote:
> 2016-05-04 21:27+0200, Radim Krčmář:
>> And I completely forgot that we can preallocate the whole tree and use a
>> effective packed storage thanks to that.  My first guess is that it
>> would be make sense with double the memory of our bitmap.  Scans and
>> insertion would be slower than for a per-vcpu list, but much faster than
>> with a dynamically allocated structure.  I'll think a bit about that.
> 
> The preallocate allocated radix tree has
>  * fast insertion (few atomic bit writes at places that are computed
>    only with arithmetics = no pointer traversal)
>  * feasible lockless removal (not sure how fast it will be, but it only
>    slows down the read side, and if we can guarantee that all vcpus are
>    stopped, then it's trivial)
>  * compatible with bitmap (a subset of the tree is 1:1 legacy bitmap)
>  * very fast search if pages are allocated in the same range (on par
>    with list)
>  * only ~2 times slower search that bitmap when the bitmap is full
>  * high memory consumption (roughly double the bitmap)
> 
> The only reason why a dynamically allocated tree would be used instead
> is unacceptable memory consumption, but we already lived with a bitmap,
> so I think it is worth a try.
> 
> I wrote a dirty, buggy an unoptimized userspace implementation.  (Code
> attached if you're interested.) Especially the bitmap is crude, so real
> numbers will look differently, but asymptotic behaviour should remain.
> 
> If we consider a slot with 10000000 pages, which is 38 GB of tracked
> memory, and datasets from 100 to 10000000 random dirty frames, then
> timing for codes that compact [l]ist, [t]ree and [b]itmap into a dirty
> list, are:
> 
>   % ./radix_tree 10000000 100
>   kB size l:0 t:1513 b:1220
>   l:        642 ns (1.00,  78.05, 45977.38)
>   t:      50108 ns (0.01,   1.00,   589.08)
>   b:   29517475 ns (0.00,   0.00,     1.00)
> 
>   % ./radix_tree 10000000 1000
>   l:       9542 ns (1.00,  43.29,  3347.11)
>   t:     413110 ns (0.02,   1.00,    77.31)
>   b:   31938143 ns (0.00,   0.01,     1.00)
> 
>   % ./radix_tree 10000000 10000
>   l:     105859 ns (1.00,  21.07,   273.75)
>   t:    2230429 ns (0.05,   1.00,    12.99)
>   b:   28978510 ns (0.00,   0.08,     1.00)
> 
>   % ./radix_tree 10000000 100000
>   l:     984729 ns (1.00,  12.16,    25.53)
>   t:   11970068 ns (0.08,   1.00,     2.10)
>   b:   25144204 ns (0.04,   0.48,     1.00)
> 
>   % ./radix_tree 10000000 1000000
>   kB size l:7812 t:1513 b:1220
>   l:    6420041 ns (1.00,   5.69,     4.13)
>   t:   36511925 ns (0.18,   1.00,     0.73)
>   b:   26523022 ns (0.24,   1.38,     1.00)
> 
>   % ./radix_tree 10000000 10000000
>   kB size l:78125 t:1513 b:1220
>   l:   38180627 ns (1.00,   1.56,     1.31)
>   t:   59681915 ns (0.64,   1.00,     0.84)
>   b:   50079550 ns (0.76,   1.19,     1.00)
> 
> The tree is 589.08 times faster than bitmap with 100 "dirty pages" and
> list is 78.05 times faster than the tree.  Speedups get smaller as the
> increases and the tree ends up performing worse than a bitmap.
> 
> The tree performs much better when the dataset isn't random, i.e. dirty
> frames are a seqence from 0 to the size of the dataset:
> 
>   % ./radix_tree 10000000 1000 seq
>   l:       8928 ns (1.00,   1.93,  3366.93)
>   t:      17198 ns (0.52,   1.00,  1747.88)
>   b:   30059972 ns (0.00,   0.00,     1.00)
> 
>   % ./radix_tree 10000000 10000 seq
>   l:      96469 ns (1.00,   1.36,   271.40)
>   t:     131124 ns (0.74,   1.00,   199.67)
>   b:   26181839 ns (0.00,   0.01,     1.00)
> 
>   % ./radix_tree 10000000 100000 seq
>   l:    1256119 ns (1.00,   1.28,    24.71)
>   t:    1609232 ns (0.78,   1.00,    19.29)
>   b:   31036003 ns (0.04,   0.05,     1.00)
> 

Interesting. I'll do some prototyping with the radix tree to see how it
performs in our environment.
Radim Krčmář May 6, 2016, 4:04 p.m. UTC | #21
2016-05-06 15:13+0000, Cao, Lei:
> On 5/6/2016 8:09 AM, Radim Krčmář wrote:
>> 2016-05-06 21:46+1200, Kai Huang:
>>> On Thu, May 5, 2016 at 6:57 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>>>> 2016-05-04 18:33+0000, Cao, Lei:
>>>>>  On 5/4/2016 1:15 PM, Cao, Lei wrote:
>>>>>>  On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>>>>>>>  Good designs so far seem to be:
>>>>>>>   memslot -> lockless radix tree
>>>>>>>  and
>>>>>>>   vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>>>>>>
>>>>>>
>>>>>>  There is no need for lookup, the dirty log is fetched in
>>>>>> sequence, so why use
>>>>>>  radix tree with added complexity but no benefit?
>>>>>>
>>>>>>  List can be designed to be lockless, so memslot -> lockless
>>>>>> fixed list?
>>>>>
>>>>>  Never mind, lookup is needed to avoid duplicates. We can use
>>>>> list+bitmap, but
>>>>>  it's obviously not as efficient as radix tree.
>>>>
>>>> Are duplicates a significant problem?
>>>>
>>>> (The dirtied page is marked as dirty, so we should have zero to very few
>>>>  duplicates, depending on how dirtying and vm-exit on write to clean
>>>> page
>>>>  cooperate.  Duplicates don't introduce any bugs and we could also check
>>>>  last few entries in the list to weed out most likely cases.)
>>>
>>> I don't think duplicated pages are significant problem. The point is we
>>> don't lose pages.
>>>
>>> I actually don't quite follow why there will be duplicated pages. Is it
>>> because lockless thing?
>> 
>> Not really.  The ugly bitmap to avoid duplicates in this series makes me
>> think that hardware does force an exit on multiple VCPUs if they
>> concurrently write to the same clear page.  A list, or any per-vcpu
>> structure, will have multiple dirty entries if that happens.
>> 
> 
> We do see tons of duplicates. Majority are from kvm updating guest time,
> steal time, and setting/clearing PV EOI.
> 
> Duplicates don't introduce bugs, but are undesirable, given a fixed-size
> dirty log.

Absolutely.  I didn't realize there were this many duplicates, and no
method of filtering seems better than the bitmap.
Looking forward to a performance comparison with a tree!

Thanks.
Kai Huang May 7, 2016, 1:48 a.m. UTC | #22
On 05/07/2016 12:09 AM, Radim Krčmář wrote:
> 2016-05-06 21:46+1200, Kai Huang:
>> On Thu, May 5, 2016 at 6:57 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>>> 2016-05-04 18:33+0000, Cao, Lei:
>>>>   On 5/4/2016 1:15 PM, Cao, Lei wrote:
>>>>>   On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>>>>>>   Good designs so far seem to be:
>>>>>>    memslot -> lockless radix tree
>>>>>>   and
>>>>>>    vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>>>>>
>>>>>   There is no need for lookup, the dirty log is fetched in
>>>>> sequence, so why use
>>>>>   radix tree with added complexity but no benefit?
>>>>>
>>>>>   List can be designed to be lockless, so memslot -> lockless
>>>>> fixed list?
>>>>   Never mind, lookup is needed to avoid duplicates. We can use
>>>> list+bitmap, but
>>>>   it's obviously not as efficient as radix tree.
>>> Are duplicates a significant problem?
>>>
>>> (The dirtied page is marked as dirty, so we should have zero to very few
>>>   duplicates, depending on how dirtying and vm-exit on write to clean
>>> page
>>>   cooperate.  Duplicates don't introduce any bugs and we could also check
>>>   last few entries in the list to weed out most likely cases.)
>> I don't think duplicated pages are significant problem. The point is we
>> don't lose pages.
>>
>> I actually don't quite follow why there will be duplicated pages. Is it
>> because lockless thing?
> Not really.  The ugly bitmap to avoid duplicates in this series makes me
> think that hardware does force an exit on multiple VCPUs if they
> concurrently write to the same clear page.  A list, or any per-vcpu
> structure, will have multiple dirty entries if that happens.
Oh yes. This makes sense. This also counts as a disadvantage of the per-vcpu 
structure.

Thanks,
-Kai
Cao, Lei May 24, 2016, 5:19 p.m. UTC | #23
On 5/6/2016 12:04 PM, Radim Krčmář wrote:
> 2016-05-06 15:13+0000, Cao, Lei:
>> On 5/6/2016 8:09 AM, Radim Krčmář wrote:
>>> 2016-05-06 21:46+1200, Kai Huang:
>>>> On Thu, May 5, 2016 at 6:57 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>>>>> 2016-05-04 18:33+0000, Cao, Lei:
>>>>>>  On 5/4/2016 1:15 PM, Cao, Lei wrote:
>>>>>>>  On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>>>>>>>>  Good designs so far seem to be:
>>>>>>>>   memslot -> lockless radix tree
>>>>>>>>  and
>>>>>>>>   vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>>>>>>>
>>>>>>>
>>>>>>>  There is no need for lookup, the dirty log is fetched in
>>>>>>> sequence, so why use
>>>>>>>  radix tree with added complexity but no benefit?
>>>>>>>
>>>>>>>  List can be designed to be lockless, so memslot -> lockless
>>>>>>> fixed list?
>>>>>>
>>>>>>  Never mind, lookup is needed to avoid duplicates. We can use
>>>>>> list+bitmap, but
>>>>>>  it's obviously not as efficient as radix tree.
>>>>>
>>>>> Are duplicates a significant problem?
>>>>>
>>>>> (The dirtied page is marked as dirty, so we should have zero to very few
>>>>>  duplicates, depending on how dirtying and vm-exit on write to clean
>>>>> page
>>>>>  cooperate.  Duplicates don't introduce any bugs and we could also check
>>>>>  last few entries in the list to weed out most likely cases.)
>>>>
>>>> I don't think duplicated pages are significant problem. The point is we
>>>> don't lose pages.
>>>>
>>>> I actually don't quite follow why there will be duplicated pages. Is it
>>>> because lockless thing?
>>>
>>> Not really.  The ugly bitmap to avoid duplicates in this series makes me
>>> think that hardware does force an exit on multiple VCPUs if they
>>> concurrently write to the same clear page.  A list, or any per-vcpu
>>> structure, will have multiple dirty entries if that happens.
>>>
>>
>> We do see tons of duplicates. Majority are from kvm updating guest time,
>> steal time, and setting/clearing PV EOI.
>>
>> Duplicates don't introduce bugs, but are undesirable, given a fixed-size
>> dirty log.
> 
> Absolutely.  I didn't realize there were this many duplicates and no
> method of filtering seems better than the bitmap.
> Looking forward to performance comparison with a tree!
> 
> Thanks.
> 

There are a couple of issues with the tree.

It doesn't handle entries inserted after a scan well: a rescan picks up
everything, not just the new entries. We acquire and preprocess dirty pages
before the guest is quiesced, then do a final pass after the guest is paused.

It's also difficult to divide a tree up so that multiple threads can work on
it at the same time. This matters when resetting all the write traps after
the guest is paused: a list can be split into ranges so that each thread
works on its own part, which is hard to do with a tree.
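
As a rough illustration of that last point, here is a minimal user-space
sketch (pthreads) of splitting a fixed-size dirty list into contiguous
ranges so several threads can re-arm write traps in parallel.
reset_write_trap() merely stands in for the per-gfn work (what
kvm_mt_mmu_reset_gfn() does in this series); all other names are
placeholders, not proposed interfaces.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_THREADS 4

struct reset_range {
	const uint64_t *gfns;	/* shared, already-filled dirty list */
	size_t start, end;	/* this worker's slice: [start, end) */
};

/* Placeholder for the per-gfn work (re-arming the write trap). */
static void reset_write_trap(uint64_t gfn)
{
	(void)gfn;
}

static void *reset_worker(void *arg)
{
	struct reset_range *r = arg;

	for (size_t i = r->start; i < r->end; i++)
		reset_write_trap(r->gfns[i]);
	return NULL;
}

int main(void)
{
	static uint64_t dirty[1024];	/* filled by the dirty-page producer */
	size_t len = 1024;
	size_t chunk = (len + NR_THREADS - 1) / NR_THREADS;
	struct reset_range ranges[NR_THREADS];
	pthread_t tid[NR_THREADS];

	for (int i = 0; i < NR_THREADS; i++) {
		ranges[i].gfns = dirty;
		ranges[i].start = (size_t)i * chunk;
		ranges[i].end = ranges[i].start + chunk > len ?
				len : ranges[i].start + chunk;
		pthread_create(&tid[i], NULL, reset_worker, &ranges[i]);
	}
	for (int i = 0; i < NR_THREADS; i++)
		pthread_join(tid[i], NULL);

	printf("reset %zu gfns with %d threads\n", len, NR_THREADS);
	return 0;
}

Each worker touches a disjoint slice of the array, so no locking is needed
once the guest is paused and the list is no longer growing.
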
Cao, Lei June 30, 2016, 1:49 p.m. UTC | #24
On 5/6/2016 12:04 PM, Radim Krčmář wrote:
> 2016-05-06 15:13+0000, Cao, Lei:
>> On 5/6/2016 8:09 AM, Radim Krčmář wrote:
>>> 2016-05-06 21:46+1200, Kai Huang:
>>>> On Thu, May 5, 2016 at 6:57 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>>>>> 2016-05-04 18:33+0000, Cao, Lei:
>>>>>>  On 5/4/2016 1:15 PM, Cao, Lei wrote:
>>>>>>>  On 5/4/2016 9:13 AM, Radim Krčmář wrote:
>>>>>>>>  Good designs so far seem to be:
>>>>>>>>   memslot -> lockless radix tree
>>>>>>>>  and
>>>>>>>>   vcpu -> memslot -> list  (memslot -> vcpu -> list)
>>>>>>>>
>>>>>>>
>>>>>>>  There is no need for lookup, the dirty log is fetched in
>>>>>>> sequence, so why use
>>>>>>>  radix tree with added complexity but no benefit?
>>>>>>>
>>>>>>>  List can be designed to be lockless, so memslot -> lockless
>>>>>>> fixed list?
>>>>>>
>>>>>>  Never mind, lookup is needed to avoid duplicates. We can use
>>>>>> list+bitmap, but
>>>>>>  it's obviously not as efficient as radix tree.
>>>>>
>>>>> Are duplicates a significant problem?
>>>>>
>>>>> (The dirtied page is marked as dirty, so we should have zero to very few
>>>>>  duplicates, depending on how dirtying and vm-exit on write to clean
>>>>> page
>>>>>  cooperate.  Duplicates don't introduce any bugs and we could also check
>>>>>  last few entries in the list to weed out most likely cases.)
>>>>
>>>> I don't think duplicated pages are significant problem. The point is we
>>>> don't lose pages.
>>>>
>>>> I actually don't quite follow why there will be duplicated pages. Is it
>>>> because lockless thing?
>>>
>>> Not really.  The ugly bitmap to avoid duplicates in this series makes me
>>> think that hardware does force an exit on multiple VCPUs if they
>>> concurrently write to the same clear page.  A list, or any per-vcpu
>>> structure, will have multiple dirty entries if that happens.
>>>
>>
>> We do see tons of duplicates. Majority are from kvm updating guest time,
>> steal time, and setting/clearing PV EOI.
>>
>> Duplicates don't introduce bugs, but are undesirable, given a fixed-size
>> dirty log.
> 
> Absolutely.  I didn't realize there were this many duplicates and no
> method of filtering seems better than the bitmap.
> Looking forward to performance comparison with a tree!
> 
> Thanks.
> 

I redid our memory tracking with a tree per memslot and ran a couple of
benchmarks (webcat and dvdstore) to gauge the performance impact. The result
is not promising: memory tracking with a tree per memslot is 20% less
performant than memory tracking with a list per vcpu.

I am going to experiment with memslot -> fixed list with spinlock next.
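
Purely as an illustration of that direction, a user-space sketch of what a
"memslot -> fixed list with spinlock" could look like; every name below is
made up for the sketch and nothing here is a proposed interface.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_DIRTY 4096

/*
 * One of these per memslot; every writer appends under the spinlock,
 * and a full list is reported via an overflow flag, mirroring how the
 * current per-vcpu lists report overflow.
 */
struct slot_dirty_list {
	pthread_spinlock_t lock;
	uint64_t gfns[MAX_DIRTY];	/* fixed-size backing store */
	size_t dirty_index;		/* next free slot in gfns[] */
	int overflow;			/* set once the list fills up */
};

static void slot_list_init(struct slot_dirty_list *l)
{
	pthread_spin_init(&l->lock, PTHREAD_PROCESS_PRIVATE);
	l->dirty_index = 0;
	l->overflow = 0;
}

/* Called from any vcpu thread that dirties a page in this memslot. */
static void slot_list_add(struct slot_dirty_list *l, uint64_t gfn)
{
	pthread_spin_lock(&l->lock);
	if (l->dirty_index < MAX_DIRTY)
		l->gfns[l->dirty_index++] = gfn;
	else
		l->overflow = 1;
	pthread_spin_unlock(&l->lock);
}

int main(void)
{
	static struct slot_dirty_list slot0;

	slot_list_init(&slot0);
	slot_list_add(&slot0, 0x1000);
	slot_list_add(&slot0, 0x1001);
	printf("slot0 has %zu dirty gfns\n", slot0.dirty_index);
	return 0;
}

Writers from all vcpu threads serialize on the per-memslot spinlock, which
keeps the structure simple while still allowing the fetch/reset side to be
divided up like a plain list.
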
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b7e3944..52bff2b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1030,6 +1030,11 @@  void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
+void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm);
+void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm);
+int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset);
+gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset);
+
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1ff4dbb..a36475a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1443,6 +1443,58 @@  restart:
 	return 0;
 }
 
+static struct kvm_memory_slot *kvm_memslot_from_id(struct kvm *kvm, int slot_id)
+{
+	int i;
+	struct kvm_memory_slot *memslot;
+	struct kvm_memslots *slots;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(memslot, slots) {
+			if (memslot->id == slot_id)
+				return memslot;
+		}
+	}
+	return NULL;
+}
+
+gfn_t kvm_mt_slot_offset_to_gfn(struct kvm *kvm, u64 slot_offset)
+{
+	struct kvm_memory_slot *slot;
+	int slot_id;
+	gfn_t offset;
+
+	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
+	slot = kvm_memslot_from_id(kvm, slot_id);
+	if (slot == NULL) {
+		pr_warn("KVM: bad slot_id %d\n", slot_id);
+		return kvm->mt.max_gfn+1;
+	}
+	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
+	return offset + slot->base_gfn;
+}
+
+int kvm_mt_mmu_reset_gfn(struct kvm *kvm, u64 slot_offset)
+{
+	struct kvm_memory_slot *slot;
+	int slot_id;
+	gfn_t offset, gfn;
+
+	slot_id = MT_SLOT_FROM_SLOT_OFFSET(slot_offset);
+	slot = kvm_memslot_from_id(kvm, slot_id);
+	offset  = MT_OFFSET_FROM_SLOT_OFFSET(slot_offset);
+	gfn = offset + slot->base_gfn;
+
+	if (gfn > kvm->mt.max_gfn) {
+		pr_warn("KVM: bad gfn %lx\n", (long)gfn);
+		return 0;
+	}
+
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot, offset, 1);
+	return 1;
+}
+
 struct slot_rmap_walk_iterator {
 	/* input fields. */
 	struct kvm_memory_slot *slot;
@@ -4762,6 +4814,47 @@  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 		kvm_flush_remote_tlbs(kvm);
 }
 
+void kvm_mmu_mt_enable_log_dirty(struct kvm *kvm)
+{
+	int i;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot(memslot, slots) {
+			if (memslot->id < KVM_USER_MEM_SLOTS) {
+				if (kvm_x86_ops->slot_enable_log_dirty)
+					kvm_x86_ops->slot_enable_log_dirty(kvm,
+						memslot);
+				else
+					kvm_mmu_slot_remove_write_access(kvm,
+						memslot);
+			}
+		}
+	}
+}
+
+void kvm_mmu_mt_disable_log_dirty(struct kvm *kvm)
+{
+	int i;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot(memslot, slots) {
+			if (memslot->id < KVM_USER_MEM_SLOTS) {
+				if (kvm_x86_ops->slot_disable_log_dirty)
+					kvm_x86_ops->slot_disable_log_dirty(kvm,
+						memslot);
+			}
+		}
+	}
+}
+
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 					 struct kvm_rmap_head *rmap_head)
 {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2bce4db..736668d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1344,11 +1344,11 @@  struct mt_enable {
 #define MT_OFFSET_MASK		(0x0000ffffffffffffUL)
 
 #define MT_MAKE_SLOT_OFFSET(slot, offset)			\
-	do {							\
+	({							\
 		__u64 slot_off = offset & MT_OFFSET_MASK;	\
 		slot_off |= ((__u64)slot << 48);		\
 		slot_off;					\
-	} while (0)
+	})
 
 #define MT_OFFSET_FROM_SLOT_OFFSET(slot_off)		\
 	(slot_off & MT_OFFSET_MASK)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fe46067..ba99cbc6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1795,8 +1795,12 @@  int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
 
-static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
-			          const void *data, int offset, int len)
+static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
+	gfn_t gfn, struct kvm_vcpu *vcpu);
+
+static int __kvm_write_guest_page(struct kvm *kvm,
+				struct kvm_memory_slot *memslot, gfn_t gfn,
+				const void *data, int offset, int len)
 {
 	int r;
 	unsigned long addr;
@@ -1808,6 +1812,8 @@  static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(memslot, gfn);
+	if (memslot && (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS))
+		mt_mark_page_dirty(kvm, memslot, gfn, NULL);
 	return 0;
 }
 
@@ -1816,7 +1822,7 @@  int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
@@ -1825,7 +1831,7 @@  int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
 
@@ -1929,6 +1935,10 @@  int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	if (ghc->memslot && (ghc->memslot->id >= 0 &&
+		ghc->memslot->id < KVM_USER_MEM_SLOTS))
+		mt_mark_page_dirty(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT,
+			NULL);
 
 	return 0;
 }
@@ -1996,11 +2006,95 @@  static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
 	}
 }
 
+/*
+ * We have some new dirty pages for our sublist waiter.  Enough to merit
+ * waking it up?
+ */
+static void mt_sw_add_pages(struct kvm *kvm)
+{
+	int avail = kvm->mt.tot_pages - kvm->mt.fetch_count;
+	struct sublist_waiter *swp = &kvm->mt.sw;
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	if (swp->goal && (avail >= swp->goal)) {
+		kvm->mt.fetch_count += avail;
+		swp->goal = 0;
+		wake_up(&swp->wq);
+	}
+
+	spin_unlock(&kvm->mt.sw_lock);
+}
+
+#define DIRTY_GFN_ADD_GRANULARITY      (256)
+
+static void mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
+	gfn_t gfn, struct kvm_vcpu *vcpu)
+{
+	int use_kvm;            /* add to global list? */
+	struct gfn_list *gfnlist;
+	int slot_id = slot->id;
+	__u64 offset = gfn - slot->base_gfn;
+	__u64 slot_offset;
+
+	/*
+	 * Try to add dirty page to vcpu list.  If vcpu is NULL or
+	 * vcpu list is full, then try to add to kvm master list.
+	 */
+
+	if (!kvm->mt.active)
+		return;
+
+	if (slot->id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	if (gfn > kvm->mt.max_gfn)
+		return;
+
+	/* if we're avoiding duplicates, is this one already marked? */
+	if (kvm->mt.bmap && test_and_set_bit(gfn, kvm->mt.bmap))
+		return;
+
+	slot_offset = MT_MAKE_SLOT_OFFSET(slot_id, offset);
+
+	use_kvm = (vcpu == NULL);
+
+	if (vcpu) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+		if (gfnlist->dirty_index == gfnlist->max_dirty) {
+			use_kvm = 1;
+			gfnlist->overflow = 1;
+			/* Fall back to master gfn list.*/
+			gfnlist = &kvm->mt.gfn_list;
+		}
+	} else {
+		gfnlist = &kvm->mt.gfn_list;
+	}
+
+	spin_lock(&gfnlist->lock);
+	if (gfnlist->dirty_index >= gfnlist->max_dirty) {
+		gfnlist->overflow = 1;
+	} else {
+		gfnlist->dirty_gfns[gfnlist->dirty_index++] = slot_offset;
+		if ((gfnlist->dirty_index % DIRTY_GFN_ADD_GRANULARITY) == 0) {
+			spin_lock(&kvm->mt.lock);
+			kvm->mt.tot_pages += DIRTY_GFN_ADD_GRANULARITY;
+			mt_sw_add_pages(kvm);
+			spin_unlock(&kvm->mt.lock);
+		}
+	}
+	spin_unlock(&gfnlist->lock);
+}
+
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot;
 
 	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot) {
+		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
+			mt_mark_page_dirty(kvm, memslot, gfn, NULL);
+	}
 	mark_page_dirty_in_slot(memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
@@ -2010,6 +2104,10 @@  void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 	struct kvm_memory_slot *memslot;
 
 	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	if (memslot) {
+		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
+			mt_mark_page_dirty(vcpu->kvm, memslot, gfn, vcpu);
+	}
 	mark_page_dirty_in_slot(memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
@@ -2823,8 +2921,6 @@  static u64 kvm_get_max_gfn(struct kvm *kvm)
 	return num_gfn - 1;
 }
 
-#define DIRTY_GFN_ADD_GRANULARITY      (256)
-
 /*
  * Return a the smallest multiple of DIRTY_GFN_ADD_GRANULARITY that is >= goal.
  */
@@ -3010,31 +3106,523 @@  static int kvm_vm_ioctl_mt_init(struct kvm *kvm, struct mt_setup *mts)
 		return -EINVAL;
 }
 
+static int kvm_enable_mt(struct kvm *kvm)
+{
+	int rc = 0;
+
+	if (kvm->mt.active) {
+		pr_warn("KVM: vm %d, MT already active\n",
+			current->pid);
+		rc = -EINVAL;
+		goto enable_mt_done;
+	}
+
+	kvm_mmu_mt_enable_log_dirty(kvm);
+	if (kvm->mt.bmap)
+		memset(kvm->mt.bmap, 0, kvm->mt.bmapsz);
+
+	kvm->mt.active = 1;
+
+enable_mt_done:
+
+	return rc;
+}
+
+static int kvm_disable_mt(struct kvm *kvm)
+{
+	int rc = 0;
+
+	if (!kvm->mt.active) {
+		pr_warn("KVM: vm %d, MT already disabled\n",
+			current->pid);
+		rc = -EINVAL;
+		goto disable_mt_done;
+	}
+
+	kvm_mmu_mt_disable_log_dirty(kvm);
+	kvm->mt.active = 0;
+
+disable_mt_done:
+
+	return rc;
+}
+
 static int kvm_vm_ioctl_mt_enable(struct kvm *kvm, struct mt_enable *mte)
 {
-	return -EINVAL;
+	if ((mte->flags & 0x1) == 1)
+		return kvm_enable_mt(kvm);
+	else if ((mte->flags & 0x1) == 0)
+		return kvm_disable_mt(kvm);
+	else
+		return -EINVAL;
 }
 
 static int kvm_vm_ioctl_mt_prepare_cp(struct kvm *kvm,
 				      struct mt_prepare_cp *mtpcp)
 {
-	return -EINVAL;
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct gfn_list *gfnlist;
+
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	kvm->mt.cp_id = mtpcp->cpid;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+		spin_lock(&gfnlist->lock);
+		gfnlist->fetch_index = 0;
+		gfnlist->reset_index = 0;
+		gfnlist->dirty_index = 0;
+		gfnlist->overflow = 0;
+		spin_unlock(&gfnlist->lock);
+	}
+
+	gfnlist = &kvm->mt.gfn_list;
+	spin_lock(&gfnlist->lock);
+	gfnlist->fetch_index = 0;
+	gfnlist->reset_index = 0;
+	gfnlist->dirty_index = 0;
+	gfnlist->overflow = 0;
+	spin_unlock(&gfnlist->lock);
+
+	kvm->mt.quiesced = 0;
+	kvm->mt.allow_blocking = 1;
+	kvm->mt.tot_pages  = kvm->mt.fetch_count = 0;
+
+	return 0;
+}
+
+static bool mt_reset_gfn(struct kvm *kvm, u64 slot_offset)
+{
+	gfn_t gfn;
+
+	gfn = kvm_mt_slot_offset_to_gfn(kvm, slot_offset);
+	if (gfn > kvm->mt.max_gfn)
+		return 0;
+
+	if (kvm->mt.bmap) {
+		if (kvm->mt.quiesced) {
+			/*
+			 * Goal is to reset entire bmap, but don't need
+			 * atomics if we are quiesced
+			 */
+			int offset32 = gfn/32;
+			int *p = (int *)(kvm->mt.bmap) + offset32;
+			*p = 0;
+		} else {
+			clear_bit(gfn, kvm->mt.bmap);
+		}
+	}
+
+	return kvm_mt_mmu_reset_gfn(kvm, slot_offset);
+}
+
+#define GFN_RESET_BATCH        (64)
+
+static int mt_reset_all_gfns(struct kvm *kvm)
+{
+	int i, j;
+	struct kvm_vcpu *vcpu;
+	struct gfn_list *gfnlist;
+	bool cleared = false;
+	int reset_start, count, avail;
+
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	if (!kvm->mt.quiesced)
+		return -EINVAL;
+
+	spin_lock(&kvm->mmu_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+vcpu_gfn_loop:
+
+		spin_lock(&gfnlist->lock);
+		reset_start = gfnlist->reset_index;
+		avail = gfnlist->dirty_index - gfnlist->reset_index;
+		count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
+		gfnlist->reset_index += count;
+		spin_unlock(&gfnlist->lock);
+
+		for (j = reset_start; j < reset_start + count; j++)
+			cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
+
+		if (count)
+			goto vcpu_gfn_loop;
+	}
+
+	gfnlist = &kvm->mt.gfn_list;
+
+global_gfn_loop:
+
+	spin_lock(&gfnlist->lock);
+	reset_start = gfnlist->reset_index;
+	avail = gfnlist->dirty_index - gfnlist->reset_index;
+	count = avail > GFN_RESET_BATCH ? GFN_RESET_BATCH : avail;
+	gfnlist->reset_index += count;
+	spin_unlock(&gfnlist->lock);
+
+	for (j = reset_start; j < reset_start + count; j++)
+		cleared |= mt_reset_gfn(kvm, gfnlist->dirty_gfns[j]);
+
+	if (count)
+		goto global_gfn_loop;
+
+	spin_unlock(&kvm->mmu_lock);
+
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return 0;
 }
 
 static int kvm_vm_ioctl_mt_rearm_gfns(struct kvm *kvm)
 {
-	return -EINVAL;
+	return mt_reset_all_gfns(kvm);
+}
+
+static int mt_unblock_sw(struct kvm *kvm)
+{
+	struct sublist_waiter *swp;
+
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	kvm->mt.allow_blocking = 0;
+
+	/* Make sure allow_blocking is clear before the wake up */
+	mb();
+
+	swp = &kvm->mt.sw;
+	wake_up(&swp->wq);
+
+	spin_unlock(&kvm->mt.sw_lock);
+
+	return 0;
 }
 
 static int kvm_vm_ioctl_mt_quiesced(struct kvm *kvm)
 {
-	return -EINVAL;
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	kvm->mt.quiesced = 1;
+
+	/* wake up the sublist waiter */
+	mt_unblock_sw(kvm);
+
+	if (kvm->mt.gfn_list.overflow)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int mt_sublist_req_nowait(struct kvm *kvm,
+				struct mt_sublist_fetch_info *msfi, int offset)
+{
+	int i, j, avail, goal = msfi->gfn_info.count;
+	struct kvm_vcpu *vcpu;
+	__u64 *gfndst, *gfnsrc;
+	int rc = 0;
+	__u64 slot_offset;
+	int index;
+
+	/* Clearing dirty/write bits requires tlb flush before exit */
+	int cleared = 0;
+
+	/* Don't need to lock gfn lists if we're in VM blackout */
+	int need_locks = !kvm->mt.quiesced;
+
+	/* Consolidate flags */
+	int reset = msfi->flags & MT_FETCH_REARM;
+	int bmap = kvm->mt.bmap != NULL;
+
+	if (goal == 0)
+		return 0;
+
+	gfndst = &msfi->gfn_info.gfnlist[offset];
+	msfi->gfn_info.count = offset;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		int len, rem;
+		int vcpu_id;
+		struct gfn_list *gfnlist;
+
+		vcpu_id = vcpu->vcpu_id;
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu_id].gfn_list;
+
+		mutex_lock(&gfnlist->mtx);
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		avail = gfnlist->dirty_index - gfnlist->fetch_index;
+		if (!avail) {
+			if (need_locks)
+				spin_unlock(&gfnlist->lock);
+			mutex_unlock(&gfnlist->mtx);
+			continue;
+		}
+		avail = avail > goal ? goal : avail;
+		for (j = 0; j < avail; j++) {
+			index = gfnlist->fetch_index+j;
+			slot_offset = gfnlist->dirty_gfns[index];
+			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
+						slot_offset);
+		}
+		gfnsrc = &kvm->mt.gfn_buf[0];
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+
+		rem = copy_to_user(gfndst, gfnsrc,
+				avail*sizeof(*gfndst)) / sizeof(*gfndst);
+
+		/*
+		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
+		 * below, but must take mmu_lock _before_ gfnlist lock.
+		 */
+		if (reset)
+			spin_lock(&kvm->mmu_lock);
+
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		len = avail - rem;
+		msfi->gfn_info.count += len;
+		gfndst += len;
+		if (reset) {
+			__u64 gfn;
+
+			for (j = 0; j < len; j++) {
+				index = gfnlist->fetch_index+j;
+				slot_offset = gfnlist->dirty_gfns[index];
+				gfn = kvm_mt_slot_offset_to_gfn(kvm,
+					slot_offset);
+				cleared +=
+					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
+				if (bmap)
+					clear_bit(gfn, kvm->mt.bmap);
+			}
+			gfnlist->reset_index += len;
+		}
+		gfnlist->fetch_index += len;
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+		if (reset)
+			spin_unlock(&kvm->mmu_lock);
+		mutex_unlock(&gfnlist->mtx);
+
+		if (len != avail) {
+			rc = -EFAULT;
+			goto copy_done_err;
+		}
+
+		goal -= avail;
+		if (goal == 0)
+			break;
+	}
+
+	/* If we still need more gfns, consult the master list */
+	if (goal) {
+		int len, rem;
+		struct gfn_list *gfnlist = &kvm->mt.gfn_list;
+
+		mutex_lock(&gfnlist->mtx);
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		avail = gfnlist->dirty_index - gfnlist->fetch_index;
+		if (!avail) {
+			if (need_locks)
+				spin_unlock(&gfnlist->lock);
+			mutex_unlock(&gfnlist->mtx);
+			goto copy_done_no_err;
+		}
+		avail = avail > goal ? goal : avail;
+		for (j = 0; j < avail; j++) {
+			index = gfnlist->fetch_index+j;
+			slot_offset = gfnlist->dirty_gfns[index];
+			kvm->mt.gfn_buf[j] = kvm_mt_slot_offset_to_gfn(kvm,
+						slot_offset);
+		}
+		gfnsrc = &kvm->mt.gfn_buf[0];
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+
+		rem = copy_to_user(gfndst, gfnsrc,
+				avail*sizeof(*gfndst)) / sizeof(*gfndst);
+
+		/*
+		 * Need mmu_lock if we're going to do kvm_mt_mmu_reset_gfn
+		 * below, but must take mmu_lock _before_ gfnlist lock.
+		 */
+		if (reset)
+			spin_lock(&kvm->mmu_lock);
+
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+
+		len = avail - rem;
+		msfi->gfn_info.count += len;
+		if (reset) {
+			__u64 slot_offset;
+			__u64 gfn;
+
+			for (j = 0; j < len; j++) {
+				index = gfnlist->fetch_index+j;
+				slot_offset = gfnlist->dirty_gfns[index];
+				gfn = kvm_mt_slot_offset_to_gfn(kvm,
+					slot_offset);
+				cleared +=
+					kvm_mt_mmu_reset_gfn(kvm, slot_offset);
+				if (bmap)
+					clear_bit(gfn, kvm->mt.bmap);
+			}
+			gfnlist->reset_index += len;
+		}
+		gfnlist->fetch_index += len;
+
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+		if (reset)
+			spin_unlock(&kvm->mmu_lock);
+		mutex_unlock(&gfnlist->mtx);
+
+		if (len != avail) {
+			rc = -EFAULT;
+			goto copy_done_err;
+		}
+
+		goal -= avail;
+	}
+
+copy_done_no_err:
+
+copy_done_err:
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return rc;
+}
+
+static int mt_sublist_req_wait(struct kvm *kvm,
+				struct mt_sublist_fetch_info *msfi)
+{
+	struct sublist_waiter *swp;
+	int goal = msfi->gfn_info.count;
+	int offset;
+	int rc;
+
+	if (msfi->gfn_info.count == 0)
+		return 0;
+
+	spin_lock(&kvm->mt.sw_lock);
+	if (!kvm->mt.allow_blocking) {
+		spin_unlock(&kvm->mt.sw_lock);
+		return -EINVAL;
+	}
+	spin_unlock(&kvm->mt.sw_lock);
+
+	rc = mt_sublist_req_nowait(kvm, msfi, 0);
+	if (rc || (msfi->gfn_info.count == goal))
+		return rc;
+
+	offset = msfi->gfn_info.count;
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	if (kvm->mt.sw_busy) {
+		spin_unlock(&kvm->mt.sw_lock);
+		return -EBUSY;
+	}
+	kvm->mt.sw_busy = 1;
+
+	swp = &kvm->mt.sw;
+	swp->goal = goal;
+
+	spin_unlock(&kvm->mt.sw_lock);
+
+	rc = wait_event_interruptible(swp->wq,
+			!kvm->mt.allow_blocking || !swp->goal);
+
+	spin_lock(&kvm->mt.sw_lock);
+
+	kvm->mt.sw_busy = 0;
+
+	spin_unlock(&kvm->mt.sw_lock);
+
+	if (rc)
+		return rc;
+
+	msfi->gfn_info.count = goal - offset;
+
+	return mt_sublist_req_nowait(kvm, msfi, offset);
+}
+
+static int mt_get_dirty_count(struct kvm *kvm,
+				struct mt_sublist_fetch_info *msfi)
+{
+	int i, avail = 0;
+	struct kvm_vcpu *vcpu;
+	struct gfn_list *gfnlist;
+
+	/* Don't need to lock gfn lists if we're in VM blackout */
+	int need_locks = !kvm->mt.quiesced;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		gfnlist = &vcpu->kvm->vcpu_mt[vcpu->vcpu_id].gfn_list;
+
+		mutex_lock(&gfnlist->mtx);
+		if (need_locks)
+			spin_lock(&gfnlist->lock);
+		avail += gfnlist->dirty_index - gfnlist->fetch_index;
+		if (need_locks)
+			spin_unlock(&gfnlist->lock);
+		mutex_unlock(&gfnlist->mtx);
+	}
+
+	gfnlist = &kvm->mt.gfn_list;
+
+	mutex_lock(&gfnlist->mtx);
+	if (need_locks)
+		spin_lock(&gfnlist->lock);
+	avail += gfnlist->dirty_index - gfnlist->fetch_index;
+	if (need_locks)
+		spin_unlock(&gfnlist->lock);
+	mutex_unlock(&gfnlist->mtx);
+
+	msfi->gfn_info.count = avail;
+
+	return 0;
 }
 
 static int kvm_vm_ioctl_mt_sublist_fetch(struct kvm *kvm,
 					 struct mt_sublist_fetch_info *mtsfi)
 {
-	return -EINVAL;
+	if (!kvm->mt.active)
+		return -EINVAL;
+
+	if (mtsfi->gfn_info.gfnlist == NULL)
+		return mt_get_dirty_count(kvm, mtsfi);
+
+	if (mtsfi->gfn_info.count == 0)
+		return 0;
+
+	if (!(mtsfi->flags & MT_FETCH_WAIT))
+		return mt_sublist_req_nowait(kvm, mtsfi, 0);
+
+	return mt_sublist_req_wait(kvm, mtsfi);
 }
 
 static int kvm_vm_ioctl_mt_dirty_trigger(struct kvm *kvm, int dirty_trigger)