diff mbox

mmu_notifier, kvm: Introduce dirty bit tracking in spte and mmu notifier to help KSM dirty bit tracking

Message ID 201106212132.39311.nai.xia@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nai Xia June 21, 2011, 1:32 p.m. UTC
Introduced kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update()
and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings
a significant performance gain to volatile page scanning in KSM.
Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if Intel EPT is
enabled, to indicate that the dirty bits of the underlying sptes are not updated by
hardware.

Signed-off-by: Nai Xia <nai.xia@gmail.com>
Acked-by: Izik Eidus <izik.eidus@ravellosystems.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
 arch/x86/kvm/mmu.h              |    3 +-
 arch/x86/kvm/vmx.c              |    1 +
 include/linux/kvm_host.h        |    2 +-
 include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
 mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
 virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
 8 files changed, 149 insertions(+), 2 deletions(-)
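
For orientation, a rough sketch of how a scanner such as KSM could consume the
two new hooks (the helper below is purely illustrative and not part of this
patch; it assumes the mmu_notifier_test_and_clear_dirty() wrapper added
elsewhere in this series):

	/* Illustrative only: gate the dirty-bit shortcut on dirty_update(). */
	static int scan_page_was_dirtied(struct mm_struct *mm, unsigned long addr)
	{
		if (!mmu_notifier_dirty_update(mm))
			return -1;	/* sptes carry no dirty info (EPT): fall back to checksum */
		return mmu_notifier_test_and_clear_dirty(mm, addr);
	}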


Comments

Chris Wright June 22, 2011, 12:21 a.m. UTC | #1
* Nai Xia (nai.xia@gmail.com) wrote:
> Introduced kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update()
> and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings
> significant performance gain in volatile pages scanning in KSM.
> Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if intel EPT is
> enabled to indicate that the dirty bits of underlying sptes are not updated by
> hardware.

Did you test with each of EPT, NPT and shadow?

> Signed-off-by: Nai Xia <nai.xia@gmail.com>
> Acked-by: Izik Eidus <izik.eidus@ravellosystems.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu.h              |    3 +-
>  arch/x86/kvm/vmx.c              |    1 +
>  include/linux/kvm_host.h        |    2 +-
>  include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
>  mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
>  8 files changed, 149 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index d2ac8e2..f0d7aa0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -848,6 +848,7 @@ extern bool kvm_rebooting;
>  int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_age_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
> +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
>  void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
>  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index aee3862..a5a0c51 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -979,6 +979,37 @@ out:
>  	return young;
>  }
>  
> +/*
> + * Caller is supposed to SetPageDirty(), it's not done inside this.
> + */
> +static
> +int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
> +				   unsigned long data)
> +{
> +	u64 *spte;
> +	int dirty = 0;
> +
> +	if (!shadow_dirty_mask) {
> +		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
> +		goto out;
> +	}

This should never fire with the dirty_update() notifier test, right?
And that means that this whole optimization is for the shadow mmu case,
arguably the legacy case.

> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		int _dirty;
> +		u64 _spte = *spte;
> +		BUG_ON(!(_spte & PT_PRESENT_MASK));
> +		_dirty = _spte & PT_DIRTY_MASK;
> +		if (_dirty) {
> +			dirty = 1;
> +			clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);

Is this sufficient (not losing dirty state ever)?

> +		}
> +		spte = rmap_next(kvm, rmapp, spte);
> +	}
> +out:
> +	return dirty;
> +}
> +
>  #define RMAP_RECYCLE_THRESHOLD 1000
>  
>  static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> @@ -1004,6 +1035,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
>  	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
>  
>  
> +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	return kvm_handle_hva(kvm, hva, 0, kvm_test_and_clear_dirty_rmapp);
> +}
> +
>  #ifdef MMU_DEBUG
>  static int is_empty_shadow_page(u64 *spt)
>  {
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 7086ca8..b8d01c3 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -18,7 +18,8 @@
>  #define PT_PCD_MASK (1ULL << 4)
>  #define PT_ACCESSED_SHIFT 5
>  #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
> -#define PT_DIRTY_MASK (1ULL << 6)
> +#define PT_DIRTY_SHIFT 6
> +#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
>  #define PT_PAGE_SIZE_MASK (1ULL << 7)
>  #define PT_PAT_MASK (1ULL << 7)
>  #define PT_GLOBAL_MASK (1ULL << 8)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index d48ec60..b407a69 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
>  		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
>  				VMX_EPT_EXECUTABLE_MASK);
>  		kvm_enable_tdp();
> +		kvm_dirty_update = 0;

Doesn't the above shadow_dirty_mask==0ull tell us this same info?

>  	} else
>  		kvm_disable_tdp();
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 31ebb59..2036bae 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -53,7 +53,7 @@
>  struct kvm;
>  struct kvm_vcpu;
>  extern struct kmem_cache *kvm_vcpu_cache;
> -
> +extern int kvm_dirty_update;
>  /*
>   * It would be nice to use something smarter than a linear search, TBD...
>   * Thankfully we dont expect many devices to register (famous last words :),
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 1d1b1e1..bd6ba2d 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -24,6 +24,9 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {

Need to add a comment to describe it.  And why is it not next to
test_and_clear_dirty()?  I see how it's used, but seems as if the
test_and_clear_dirty() code could return -1 (as in dirty state unknown)
for the case where it can't track dirty bit and fall back to checksum.
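
For example, the kind of documentation being asked for might read (sketch
only):

	/*
	 * dirty_update: tells the caller whether this secondary MMU keeps
	 * the dirty bit up to date in its page tables (sptes), i.e. whether
	 * test_and_clear_dirty() results are meaningful.  Returns 0 when
	 * hardware does not update the dirty bit (e.g. Intel EPT), non-zero
	 * otherwise.
	 */
	int (*dirty_update)(struct mmu_notifier *mn,
			     struct mm_struct *mm);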

> +	int (*dirty_update)(struct mmu_notifier *mn,
> +			     struct mm_struct *mm);
> +
>  	/*
>  	 * Called either by mmu_notifier_unregister or when the mm is
>  	 * being destroyed by exit_mmap, always before all pages are
> @@ -72,6 +75,16 @@ struct mmu_notifier_ops {
>  			  unsigned long address);
>  
>  	/*
> +	 * clear_flush_dirty is called after the VM is
> +	 * test-and-clearing the dirty/modified bitflag in the
> +	 * pte. This way the VM will provide proper volatile page
> +	 * testing to ksm.
> +	 */
> +	int (*test_and_clear_dirty)(struct mmu_notifier *mn,
> +				    struct mm_struct *mm,
> +				    unsigned long address);
> +
> +	/*
>  	 * change_pte is called in cases that pte mapping to page is changed:
>  	 * for example, when ksm remaps pte to point to a new shared page.
>  	 */
> @@ -170,11 +183,14 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
>  extern void mmu_notifier_unregister(struct mmu_notifier *mn,
>  				    struct mm_struct *mm);
>  extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
> +extern int __mmu_notifier_dirty_update(struct mm_struct *mm);
>  extern void __mmu_notifier_release(struct mm_struct *mm);
>  extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
>  					  unsigned long address);
>  extern int __mmu_notifier_test_young(struct mm_struct *mm,
>  				     unsigned long address);
> +extern int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
> +					       unsigned long address);
>  extern void __mmu_notifier_change_pte(struct mm_struct *mm,
>  				      unsigned long address, pte_t pte);
>  extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
> @@ -184,6 +200,19 @@ extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
>  extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
>  
> +/*
> + * For ksm to make use of dirty bit, it wants to make sure that the dirty bits
> + * in sptes really carry the dirty information. Currently only intel EPT is
> + * not for ksm dirty bit tracking.
> + */
> +static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
> +{
> +	if (mm_has_notifiers(mm))
> +		return __mmu_notifier_dirty_update(mm);
> +

No need for extra newline.

> +	return 1;
> +}
> +

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 96ebc06..22967c8 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -78,6 +78,8 @@ static atomic_t hardware_enable_failed;
>  struct kmem_cache *kvm_vcpu_cache;
>  EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
>  
> +int kvm_dirty_update = 1;
> +
>  static __read_mostly struct preempt_ops kvm_preempt_ops;
>  
>  struct dentry *kvm_debugfs_dir;
> @@ -398,6 +400,23 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
>  	return young;
>  }
>  
> +/* Caller should SetPageDirty(), no need to flush tlb */
> +static int kvm_mmu_notifier_test_and_clear_dirty(struct mmu_notifier *mn,
> +						 struct mm_struct *mm,
> +						 unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	int dirty, idx;

Perhaps something like:

	if (!shadow_dirty_mask)
		return -1;

And adjust caller logic accordingly?
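
For illustration, the adjusted caller logic could then look roughly like this
(a sketch; ksm_page_changed_by_checksum() is an illustrative name rather than
an existing function, and mmu_notifier_test_and_clear_dirty() is the wrapper
added elsewhere in this series):

	int dirty = mmu_notifier_test_and_clear_dirty(mm, address);

	if (dirty < 0)	/* dirty state unknown: fall back to the checksum */
		changed = ksm_page_changed_by_checksum(rmap_item, page);
	else
		changed = dirty;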

> +	idx = srcu_read_lock(&kvm->srcu);
> +	spin_lock(&kvm->mmu_lock);
> +	dirty = kvm_test_and_clear_dirty_hva(kvm, address);
> +	spin_unlock(&kvm->mmu_lock);
> +	srcu_read_unlock(&kvm->srcu, idx);
> +
> +	return dirty;
> +}
Nai Xia June 22, 2011, 4:43 a.m. UTC | #2
On Wednesday 22 June 2011 08:21:23 Chris Wright wrote:
> * Nai Xia (nai.xia@gmail.com) wrote:
> > Introduced kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update()
> > and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings
> > significant performance gain in volatile pages scanning in KSM.
> > Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if intel EPT is
> > enabled to indicate that the dirty bits of underlying sptes are not updated by
> > hardware.
> 
> Did you test with each of EPT, NPT and shadow?

I tested in EPT and pure softmmu. I have no NPT box and Izik told me that he 
did not have one either, so help me ... :D

> 
> > Signed-off-by: Nai Xia <nai.xia@gmail.com>
> > Acked-by: Izik Eidus <izik.eidus@ravellosystems.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |    1 +
> >  arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
> >  arch/x86/kvm/mmu.h              |    3 +-
> >  arch/x86/kvm/vmx.c              |    1 +
> >  include/linux/kvm_host.h        |    2 +-
> >  include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
> >  mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
> >  virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
> >  8 files changed, 149 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index d2ac8e2..f0d7aa0 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -848,6 +848,7 @@ extern bool kvm_rebooting;
> >  int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
> >  int kvm_age_hva(struct kvm *kvm, unsigned long hva);
> >  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
> > +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
> >  void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
> >  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
> >  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index aee3862..a5a0c51 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -979,6 +979,37 @@ out:
> >  	return young;
> >  }
> >  
> > +/*
> > + * Caller is supposed to SetPageDirty(), it's not done inside this.
> > + */
> > +static
> > +int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
> > +				   unsigned long data)
> > +{
> > +	u64 *spte;
> > +	int dirty = 0;
> > +
> > +	if (!shadow_dirty_mask) {
> > +		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
> > +		goto out;
> > +	}
> 
> This should never fire with the dirty_update() notifier test, right?
> And that means that this whole optimization is for the shadow mmu case,
> arguably the legacy case.

Yes, right. Actually I added this to guard against potential abuse of this
interface, since its name alone does not make the restriction obvious. It
could be turned into a comment to save some .text and to compete with the
"10k/3 lines optimization" in the list :P

> 
> > +
> > +	spte = rmap_next(kvm, rmapp, NULL);
> > +	while (spte) {
> > +		int _dirty;
> > +		u64 _spte = *spte;
> > +		BUG_ON(!(_spte & PT_PRESENT_MASK));
> > +		_dirty = _spte & PT_DIRTY_MASK;
> > +		if (_dirty) {
> > +			dirty = 1;
> > +			clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
> 
> Is this sufficient (not losing dirty state ever)?

This does lose some dirty state. Without a TLB flush, the CPU may not write
the dirty bit back into the spte (I checked Intel's manual; x86 does not
update it in this case). But we (Izik and I) think that is acceptable, because
currently the only user of the dirty bit information is KSM, and losing some
of it is not critical. If we do find a problem with this in the future, we can
add the flushing. What do you think?

> 
> > +		}
> > +		spte = rmap_next(kvm, rmapp, spte);
> > +	}
> > +out:
> > +	return dirty;
> > +}
> > +
> >  #define RMAP_RECYCLE_THRESHOLD 1000
> >  
> >  static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> > @@ -1004,6 +1035,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
> >  	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
> >  
> >  
> > +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva)
> > +{
> > +	return kvm_handle_hva(kvm, hva, 0, kvm_test_and_clear_dirty_rmapp);
> > +}
> > +
> >  #ifdef MMU_DEBUG
> >  static int is_empty_shadow_page(u64 *spt)
> >  {
> > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > index 7086ca8..b8d01c3 100644
> > --- a/arch/x86/kvm/mmu.h
> > +++ b/arch/x86/kvm/mmu.h
> > @@ -18,7 +18,8 @@
> >  #define PT_PCD_MASK (1ULL << 4)
> >  #define PT_ACCESSED_SHIFT 5
> >  #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
> > -#define PT_DIRTY_MASK (1ULL << 6)
> > +#define PT_DIRTY_SHIFT 6
> > +#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
> >  #define PT_PAGE_SIZE_MASK (1ULL << 7)
> >  #define PT_PAT_MASK (1ULL << 7)
> >  #define PT_GLOBAL_MASK (1ULL << 8)
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index d48ec60..b407a69 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
> >  		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
> >  				VMX_EPT_EXECUTABLE_MASK);
> >  		kvm_enable_tdp();
> > +		kvm_dirty_update = 0;
> 
> Doesn't the above shadow_dirty_mask==0ull tell us this same info?

Yes, it's nasty. I am not sure about making shadow_dirty_mask global, though,
since all the other similar state variables are static.

> 
> >  	} else
> >  		kvm_disable_tdp();
> >  
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 31ebb59..2036bae 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -53,7 +53,7 @@
> >  struct kvm;
> >  struct kvm_vcpu;
> >  extern struct kmem_cache *kvm_vcpu_cache;
> > -
> > +extern int kvm_dirty_update;
> >  /*
> >   * It would be nice to use something smarter than a linear search, TBD...
> >   * Thankfully we dont expect many devices to register (famous last words :),
> > diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> > index 1d1b1e1..bd6ba2d 100644
> > --- a/include/linux/mmu_notifier.h
> > +++ b/include/linux/mmu_notifier.h
> > @@ -24,6 +24,9 @@ struct mmu_notifier_mm {
> >  };
> >  
> >  struct mmu_notifier_ops {
> 
> Need to add a comment to describe it.  And why is it not next to
> test_and_clear_dirty()?  I see how it's used, but seems as if the
> test_and_clear_dirty() code could return -1 (as in dirty state unknown)
> for the case where it can't track dirty bit and fall back to checksum.

Actually I did consider this option. But I thought it would be weird for a
function whose name says it tests a bit to return -1. Would it be the
first one in human history to do so? :D

Thanks,
Nai

> 
> > +	int (*dirty_update)(struct mmu_notifier *mn,
> > +			     struct mm_struct *mm);
> > +
> >  	/*
Izik Eidus June 22, 2011, 6:15 a.m. UTC | #3
On 6/22/2011 3:21 AM, Chris Wright wrote:
> * Nai Xia (nai.xia@gmail.com) wrote:
>> Introduced kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update()
>> and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings
>> significant performance gain in volatile pages scanning in KSM.
>> Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if intel EPT is
>> enabled to indicate that the dirty bits of underlying sptes are not updated by
>> hardware.
> Did you test with each of EPT, NPT and shadow?
>
>> Signed-off-by: Nai Xia<nai.xia@gmail.com>
>> Acked-by: Izik Eidus<izik.eidus@ravellosystems.com>
>> ---
>>   arch/x86/include/asm/kvm_host.h |    1 +
>>   arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
>>   arch/x86/kvm/mmu.h              |    3 +-
>>   arch/x86/kvm/vmx.c              |    1 +
>>   include/linux/kvm_host.h        |    2 +-
>>   include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
>>   mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
>>   virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
>>   8 files changed, 149 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index d2ac8e2..f0d7aa0 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -848,6 +848,7 @@ extern bool kvm_rebooting;
>>   int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
>>   int kvm_age_hva(struct kvm *kvm, unsigned long hva);
>>   int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
>> +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
>>   void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
>>   int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
>>   int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index aee3862..a5a0c51 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -979,6 +979,37 @@ out:
>>   	return young;
>>   }
>>
>> +/*
>> + * Caller is supposed to SetPageDirty(), it's not done inside this.
>> + */
>> +static
>> +int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
>> +				   unsigned long data)
>> +{
>> +	u64 *spte;
>> +	int dirty = 0;
>> +
>> +	if (!shadow_dirty_mask) {
>> +		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
>> +		goto out;
>> +	}
> This should never fire with the dirty_update() notifier test, right?
> And that means that this whole optimization is for the shadow mmu case,
> arguably the legacy case.
>

Hi Chris,
AMD NPT does track the dirty bit in the nested page tables,
so shadow_dirty_mask should not be 0 in that case...
Nai Xia June 22, 2011, 6:38 a.m. UTC | #4
On Wednesday 22 June 2011 14:15:51 Izik Eidus wrote:
> On 6/22/2011 3:21 AM, Chris Wright wrote:
> > * Nai Xia (nai.xia@gmail.com) wrote:
> >> Introduced kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update()
> >> and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings
> >> significant performance gain in volatile pages scanning in KSM.
> >> Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if intel EPT is
> >> enabled to indicate that the dirty bits of underlying sptes are not updated by
> >> hardware.
> > Did you test with each of EPT, NPT and shadow?
> >
> >> Signed-off-by: Nai Xia<nai.xia@gmail.com>
> >> Acked-by: Izik Eidus<izik.eidus@ravellosystems.com>
> >> ---
> >>   arch/x86/include/asm/kvm_host.h |    1 +
> >>   arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
> >>   arch/x86/kvm/mmu.h              |    3 +-
> >>   arch/x86/kvm/vmx.c              |    1 +
> >>   include/linux/kvm_host.h        |    2 +-
> >>   include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
> >>   mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
> >>   virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
> >>   8 files changed, 149 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index d2ac8e2..f0d7aa0 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -848,6 +848,7 @@ extern bool kvm_rebooting;
> >>   int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
> >>   int kvm_age_hva(struct kvm *kvm, unsigned long hva);
> >>   int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
> >> +int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
> >>   void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
> >>   int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
> >>   int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index aee3862..a5a0c51 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> @@ -979,6 +979,37 @@ out:
> >>   	return young;
> >>   }
> >>
> >> +/*
> >> + * Caller is supposed to SetPageDirty(), it's not done inside this.
> >> + */
> >> +static
> >> +int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
> >> +				   unsigned long data)
> >> +{
> >> +	u64 *spte;
> >> +	int dirty = 0;
> >> +
> >> +	if (!shadow_dirty_mask) {
> >> +		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
> >> +		goto out;
> >> +	}
> > This should never fire with the dirty_update() notifier test, right?
> > And that means that this whole optimization is for the shadow mmu case,
> > arguably the legacy case.
> >
> 
> Hi Chris,
> AMD npt does track the dirty bit in the nested page tables,
> so the shadow_dirty_mask should not be 0 in that case...
> 
Hi Izik,
I think he meant that if the caller behaves correctly and !shadow_dirty_mask,
kvm_test_and_clear_dirty_rmapp() will never be called at all, so this test
inside kvm_test_and_clear_dirty_rmapp() is useless... As I said, I added the
test in case this interface is ever abused by other callers, as a softer
BUG_ON() --- the dirty bit is not critical enough to warrant a real BUG().



Avi Kivity June 22, 2011, 10:43 a.m. UTC | #5
On 06/21/2011 04:32 PM, Nai Xia wrote:
> Introduced kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update()
> and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings
> significant performance gain in volatile pages scanning in KSM.
> Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if intel EPT is
> enabled to indicate that the dirty bits of underlying sptes are not updated by
> hardware.
>


Can you quantify the performance gains?

> +int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
> +				   unsigned long data)
> +{
> +	u64 *spte;
> +	int dirty = 0;
> +
> +	if (!shadow_dirty_mask) {
> +		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
> +		goto out;
> +	}
> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		int _dirty;
> +		u64 _spte = *spte;
> +		BUG_ON(!(_spte & PT_PRESENT_MASK));
> +		_dirty = _spte & PT_DIRTY_MASK;
> +		if (_dirty) {
> +			dirty = 1;
> +			clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
> +		}

Racy.  Also, needs a tlb flush eventually.
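
For reference, the read-then-clear could be collapsed into one atomic step,
e.g. this untested sketch:

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		if (test_and_clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte))
			dirty = 1;
		spte = rmap_next(kvm, rmapp, spte);
	}

which removes the window between reading *spte and clearing the bit.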

> +		spte = rmap_next(kvm, rmapp, spte);
> +	}
> +out:
> +	return dirty;
> +}
> +
>   #define RMAP_RECYCLE_THRESHOLD 1000
>
>
>   struct mmu_notifier_ops {
> +	int (*dirty_update)(struct mmu_notifier *mn,
> +			     struct mm_struct *mm);
> +

I prefer to have test_and_clear_dirty() always return 1 in this case (if 
the spte is writeable), and drop this callback.
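
A rough sketch of that alternative inside kvm_test_and_clear_dirty_rmapp()
(illustrative only, not tested):

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		if (!shadow_dirty_mask) {
			/* No hardware dirty tracking (EPT): report any
			 * writable spte as potentially dirty. */
			if (*spte & PT_WRITABLE_MASK)
				dirty = 1;
		} else if (*spte & PT_DIRTY_MASK) {
			dirty = 1;
			clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
		}
		spte = rmap_next(kvm, rmapp, spte);
	}
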
> +int __mmu_notifier_dirty_update(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	struct hlist_node *n;
> +	int dirty_update = 0;
> +
> +	rcu_read_lock();
> +	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
> +		if (mn->ops->dirty_update)
> +			dirty_update |= mn->ops->dirty_update(mn, mm);
> +	}
> +	rcu_read_unlock();
> +

Should it not be &= instead?
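
In code, the &= form would look roughly like this (sketch; the accumulator
then has to start at 1, otherwise the result is always 0):

	int dirty_update = 1;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->dirty_update)
			dirty_update &= mn->ops->dirty_update(mn, mm);
	}
	rcu_read_unlock();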

> +	return dirty_update;
> +}
> +
>   /*
>    * This function can't run concurrently against mmu_notifier_register
> because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
Izik Eidus June 22, 2011, 11:05 a.m. UTC | #6
On 6/22/2011 1:43 PM, Avi Kivity wrote:
> On 06/21/2011 04:32 PM, Nai Xia wrote:
>> Introduced kvm_mmu_notifier_test_and_clear_dirty(), 
>> kvm_mmu_notifier_dirty_update()
>> and their mmu_notifier interfaces to support KSM dirty bit tracking, 
>> which brings
>> significant performance gain in volatile pages scanning in KSM.
>> Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if 
>> intel EPT is
>> enabled to indicate that the dirty bits of underlying sptes are not 
>> updated by
>> hardware.
>>
>
>
> Can you quantify the performance gains?
>
>> +int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long 
>> *rmapp,
>> +                   unsigned long data)
>> +{
>> +    u64 *spte;
>> +    int dirty = 0;
>> +
>> +    if (!shadow_dirty_mask) {
>> +        WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
>> +        goto out;
>> +    }
>> +
>> +    spte = rmap_next(kvm, rmapp, NULL);
>> +    while (spte) {
>> +        int _dirty;
>> +        u64 _spte = *spte;
>> +        BUG_ON(!(_spte & PT_PRESENT_MASK));
>> +        _dirty = _spte & PT_DIRTY_MASK;
>> +        if (_dirty) {
>> +            dirty = 1;
>> +            clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
>> +        }
>
> Racy.  Also, needs a tlb flush eventually.

Hi, one of the issues is that the whole point of this patch is to avoid doing
the tlb flush. But I see your point: other users will not expect such
behavior, so maybe we need a flush_tlb parameter, or another mmu notifier
call?

Avi Kivity June 22, 2011, 11:10 a.m. UTC | #7
On 06/22/2011 02:05 PM, Izik Eidus wrote:
>>> +    spte = rmap_next(kvm, rmapp, NULL);
>>> +    while (spte) {
>>> +        int _dirty;
>>> +        u64 _spte = *spte;
>>> +        BUG_ON(!(_spte & PT_PRESENT_MASK));
>>> +        _dirty = _spte & PT_DIRTY_MASK;
>>> +        if (_dirty) {
>>> +            dirty = 1;
>>> +            clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
>>> +        }
>>
>> Racy.  Also, needs a tlb flush eventually.
> +
>
> Hi, one of the issues is that the whole point of this patch is not do 
> tlb flush eventually,
> But I see your point, because other users will not expect such 
> behavior, so maybe there is need into a parameter
> flush_tlb=?, or add another mmu notifier call?
>

If you don't flush the tlb, a subsequent write will not see that spte.d 
is clear and the write will happen.  So you'll see the page as clean 
even though it's dirty.  That's not acceptable.
Izik Eidus June 22, 2011, 11:19 a.m. UTC | #8
On 6/22/2011 2:10 PM, Avi Kivity wrote:
> On 06/22/2011 02:05 PM, Izik Eidus wrote:
>>>> +    spte = rmap_next(kvm, rmapp, NULL);
>>>> +    while (spte) {
>>>> +        int _dirty;
>>>> +        u64 _spte = *spte;
>>>> +        BUG_ON(!(_spte & PT_PRESENT_MASK));
>>>> +        _dirty = _spte & PT_DIRTY_MASK;
>>>> +        if (_dirty) {
>>>> +            dirty = 1;
>>>> +            clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
>>>> +        }
>>>
>>> Racy.  Also, needs a tlb flush eventually.
>> +
>>
>> Hi, one of the issues is that the whole point of this patch is not do 
>> tlb flush eventually,
>> But I see your point, because other users will not expect such 
>> behavior, so maybe there is need into a parameter
>> flush_tlb=?, or add another mmu notifier call?
>>
>
> If you don't flush the tlb, a subsequent write will not see that 
> spte.d is clear and the write will happen.  So you'll see the page as 
> clean even though it's dirty.  That's not acceptable.
>

Yes, but this is exactly what we want for this use case:
right now ksm calculates the page hash to see whether the page has changed;
the idea behind this patch is to use the dirty bit instead.
However, the guest might not really like the fact that we flush its tlb
over and over again, especially in a periodic scan like the one ksm does.

So what we are saying here is: it is better to have a little junk in the
unstable tree, which gets flushed eventually anyway, than to make the guest
slower...
this race does not hurt the accuracy of ksm anyway, because of the full
memcmp that we will eventually perform...

Of course we trust that in most cases a tlb flush will already have been
performed, because on real systems it takes ksm a few minutes to get back
to a given random virtual address.

What do you think about having 2 calls: one that does the expected behavior
and does flush the tlb, and one that clearly says it does not flush the tlb
and explains its use case for ksm?
Avi Kivity June 22, 2011, 11:24 a.m. UTC | #9
On 06/22/2011 02:19 PM, Izik Eidus wrote:
> On 6/22/2011 2:10 PM, Avi Kivity wrote:
>> On 06/22/2011 02:05 PM, Izik Eidus wrote:
>>>>> +    spte = rmap_next(kvm, rmapp, NULL);
>>>>> +    while (spte) {
>>>>> +        int _dirty;
>>>>> +        u64 _spte = *spte;
>>>>> +        BUG_ON(!(_spte & PT_PRESENT_MASK));
>>>>> +        _dirty = _spte & PT_DIRTY_MASK;
>>>>> +        if (_dirty) {
>>>>> +            dirty = 1;
>>>>> +            clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
>>>>> +        }
>>>>
>>>> Racy.  Also, needs a tlb flush eventually.
>>> +
>>>
>>> Hi, one of the issues is that the whole point of this patch is not 
>>> do tlb flush eventually,
>>> But I see your point, because other users will not expect such 
>>> behavior, so maybe there is need into a parameter
>>> flush_tlb=?, or add another mmu notifier call?
>>>
>>
>> If you don't flush the tlb, a subsequent write will not see that 
>> spte.d is clear and the write will happen.  So you'll see the page as 
>> clean even though it's dirty.  That's not acceptable.
>>
>
> Yes, but this is exactly what we want from this use case:
> Right now ksm calculate the page hash to see if it was changed, the 
> idea behind this patch is to use the dirty bit instead,
> however the guest might not really like the fact that we will flush 
> its tlb over and over again, specially in periodically scan like ksm 
> does.

I see.

>
> So what we say here is: it is better to have little junk in the 
> unstable tree that get flushed eventualy anyway, instead of make the 
> guest slower....
> this race is something that does not reflect accurate of ksm anyway 
> due to the full memcmp that we will eventualy perform...
>
> Ofcurse we trust that in most cases, beacuse it take ksm to get into a 
> random virtual address in real systems few minutes, there will be 
> already tlb flush performed.
>
> What you think about having 2 calls: one that does the expected 
> behivor and does flush the tlb, and one that clearly say it doesnt 
> flush the tlb
> and expline its use case for ksm?

Yes.  And if the unstable/fast callback is not provided, have the common 
code fall back to the stable/slow callback instead.

Or have a parameter that allows inaccurate results to be returned more 
quickly.
Avi Kivity June 22, 2011, 11:28 a.m. UTC | #10
On 06/22/2011 02:24 PM, Avi Kivity wrote:
> On 06/22/2011 02:19 PM, Izik Eidus wrote:
>> On 6/22/2011 2:10 PM, Avi Kivity wrote:
>>> On 06/22/2011 02:05 PM, Izik Eidus wrote:
>>>>>> +    spte = rmap_next(kvm, rmapp, NULL);
>>>>>> +    while (spte) {
>>>>>> +        int _dirty;
>>>>>> +        u64 _spte = *spte;
>>>>>> +        BUG_ON(!(_spte & PT_PRESENT_MASK));
>>>>>> +        _dirty = _spte & PT_DIRTY_MASK;
>>>>>> +        if (_dirty) {
>>>>>> +            dirty = 1;
>>>>>> +            clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
>>>>>> +        }
>>>>>
>>>>> Racy.  Also, needs a tlb flush eventually.
>>>> +
>>>>
>>>> Hi, one of the issues is that the whole point of this patch is not 
>>>> do tlb flush eventually,
>>>> But I see your point, because other users will not expect such 
>>>> behavior, so maybe there is need into a parameter
>>>> flush_tlb=?, or add another mmu notifier call?
>>>>
>>>
>>> If you don't flush the tlb, a subsequent write will not see that 
>>> spte.d is clear and the write will happen.  So you'll see the page 
>>> as clean even though it's dirty.  That's not acceptable.
>>>
>>
>> Yes, but this is exactly what we want from this use case:
>> Right now ksm calculate the page hash to see if it was changed, the 
>> idea behind this patch is to use the dirty bit instead,
>> however the guest might not really like the fact that we will flush 
>> its tlb over and over again, specially in periodically scan like ksm 
>> does.
>
> I see.

Actually, this is dangerous.  If we use the dirty bit for other things, 
we will get data corruption.

For example we might want to map clean host pages as writeable-clean in 
the spte on a read fault so that we don't get a page fault when they get 
eventually written.
Avi Kivity June 22, 2011, 11:31 a.m. UTC | #11
On 06/22/2011 02:28 PM, Avi Kivity wrote:
>
> Actually, this is dangerous.  If we use the dirty bit for other 
> things, we will get data corruption.
>
> For example we might want to map clean host pages as writeable-clean 
> in the spte on a read fault so that we don't get a page fault when 
> they get eventually written.
>

Another example - we can use the dirty bit for dirty page loggging.

So I think we can get away with a conditional tlb flush - only flush if 
the page was dirty.  That should be rare after the first pass, at least 
with small pages.
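
A minimal sketch of that conditional flush at the end of the rmap walk
(illustrative; kvm_flush_remote_tlbs() is the existing KVM helper):

	if (dirty)
		/* only pay for the flush when a dirty bit was actually cleared */
		kvm_flush_remote_tlbs(kvm);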
Nai Xia June 22, 2011, 11:33 a.m. UTC | #12
On Wednesday 22 June 2011 19:28:08 Avi Kivity wrote:
> On 06/22/2011 02:24 PM, Avi Kivity wrote:
> > On 06/22/2011 02:19 PM, Izik Eidus wrote:
> >> On 6/22/2011 2:10 PM, Avi Kivity wrote:
> >>> On 06/22/2011 02:05 PM, Izik Eidus wrote:
> >>>>>> +    spte = rmap_next(kvm, rmapp, NULL);
> >>>>>> +    while (spte) {
> >>>>>> +        int _dirty;
> >>>>>> +        u64 _spte = *spte;
> >>>>>> +        BUG_ON(!(_spte & PT_PRESENT_MASK));
> >>>>>> +        _dirty = _spte & PT_DIRTY_MASK;
> >>>>>> +        if (_dirty) {
> >>>>>> +            dirty = 1;
> >>>>>> +            clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
> >>>>>> +        }
> >>>>>
> >>>>> Racy.  Also, needs a tlb flush eventually.
> >>>> +
> >>>>
> >>>> Hi, one of the issues is that the whole point of this patch is not 
> >>>> do tlb flush eventually,
> >>>> But I see your point, because other users will not expect such 
> >>>> behavior, so maybe there is need into a parameter
> >>>> flush_tlb=?, or add another mmu notifier call?
> >>>>
> >>>
> >>> If you don't flush the tlb, a subsequent write will not see that 
> >>> spte.d is clear and the write will happen.  So you'll see the page 
> >>> as clean even though it's dirty.  That's not acceptable.
> >>>
> >>
> >> Yes, but this is exactly what we want from this use case:
> >> Right now ksm calculate the page hash to see if it was changed, the 
> >> idea behind this patch is to use the dirty bit instead,
> >> however the guest might not really like the fact that we will flush 
> >> its tlb over and over again, specially in periodically scan like ksm 
> >> does.
> >
> > I see.
> 
> Actually, this is dangerous.  If we use the dirty bit for other things, 
> we will get data corruption.

Yeah, I actually addressed a similar concern of Chris's in an earlier reply:
we are currently the _only_ user. :)
We can add the flushing when someone else needs to rely on this bit.

> 
> For example we might want to map clean host pages as writeable-clean in 
> the spte on a read fault so that we don't get a page fault when they get 
> eventually written.
> 
> 
Izik Eidus June 22, 2011, 11:39 a.m. UTC | #13
On 6/22/2011 2:33 PM, Nai Xia wrote:
> On Wednesday 22 June 2011 19:28:08 Avi Kivity wrote:
>> On 06/22/2011 02:24 PM, Avi Kivity wrote:
>>> On 06/22/2011 02:19 PM, Izik Eidus wrote:
>>>> On 6/22/2011 2:10 PM, Avi Kivity wrote:
>>>>> On 06/22/2011 02:05 PM, Izik Eidus wrote:
>>>>>>>> +    spte = rmap_next(kvm, rmapp, NULL);
>>>>>>>> +    while (spte) {
>>>>>>>> +        int _dirty;
>>>>>>>> +        u64 _spte = *spte;
>>>>>>>> +        BUG_ON(!(_spte & PT_PRESENT_MASK));
>>>>>>>> +        _dirty = _spte & PT_DIRTY_MASK;
>>>>>>>> +        if (_dirty) {
>>>>>>>> +            dirty = 1;
>>>>>>>> +            clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
>>>>>>>> +        }
>>>>>>> Racy.  Also, needs a tlb flush eventually.
>>>>>> +
>>>>>>
>>>>>> Hi, one of the issues is that the whole point of this patch is not
>>>>>> do tlb flush eventually,
>>>>>> But I see your point, because other users will not expect such
>>>>>> behavior, so maybe there is need into a parameter
>>>>>> flush_tlb=?, or add another mmu notifier call?
>>>>>>
>>>>> If you don't flush the tlb, a subsequent write will not see that
>>>>> spte.d is clear and the write will happen.  So you'll see the page
>>>>> as clean even though it's dirty.  That's not acceptable.
>>>>>
>>>> Yes, but this is exactly what we want from this use case:
>>>> Right now ksm calculate the page hash to see if it was changed, the
>>>> idea behind this patch is to use the dirty bit instead,
>>>> however the guest might not really like the fact that we will flush
>>>> its tlb over and over again, specially in periodically scan like ksm
>>>> does.
>>> I see.
>> Actually, this is dangerous.  If we use the dirty bit for other things,
>> we will get data corruption.
> Yeah,yeah, I actually clarified in a reply letter to Chris about his similar
> concern that we are currently the _only_ user. :)
> We can add the flushing when someone else should rely on this bit.
>

I suggest adding the flushing when someone else uses it as well.

Btw, I don't think this whole optimization is worthwhile for kvm guests if a
tlb flush must be performed: on a machine with a lot of cpus, it is much
better for ksm to burn one cpu than to slow down all the others...
So while this patch would really make ksm look faster, the whole system
would be slower...

So if you don't want to add the flushing when someone else relies on it,
it would be better to use the dirty-bit trick just for userspace applications
and not for kvm guests.
Andrea Arcangeli June 22, 2011, 3:03 p.m. UTC | #14
On Tue, Jun 21, 2011 at 09:32:39PM +0800, Nai Xia wrote:
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index d48ec60..b407a69 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
>  		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
>  				VMX_EPT_EXECUTABLE_MASK);
>  		kvm_enable_tdp();
> +		kvm_dirty_update = 0;
>  	} else
>  		kvm_disable_tdp();
>  

Why not return !shadow_dirty_mask instead of adding a new var?

>  struct mmu_notifier_ops {
> +	int (*dirty_update)(struct mmu_notifier *mn,
> +			     struct mm_struct *mm);
> +

Needs some docu.

I think dirty_update isn't self explanatory name. I think
"has_test_and_clear_dirty" would be better.

If we don't flush the smp tlb don't we risk that we'll insert pages in
the unstable tree that are volatile just because the dirty bit didn't
get set again on the spte?

The first patch, I guess, is a sign of hugetlbfs going a little over the edge
in trying to mix with the core VM... Passing that &need_pte_unmap parameter
all over the place is not so nice; maybe it'd be possible to fix this within
hugetlbfs by using a different method to walk the hugetlb vmas. I'd prefer
that if possible.

Thanks,
Andrea
Izik Eidus June 22, 2011, 3:19 p.m. UTC | #15
> If we don't flush the smp tlb don't we risk that we'll insert pages in
> the unstable tree that are volatile just because the dirty bit didn't
> get set again on the spte?

Yes, this is the trade-off we take; the unstable tree is flushed anyway -
so this is nothing that won't be recovered very soon after it happens...

and in most cases the tlb will have been flushed before ksm gets there anyway
(especially for a heavily modified page, which we don't want in the unstable
tree)
Rik van Riel June 22, 2011, 3:39 p.m. UTC | #16
On 06/22/2011 07:19 AM, Izik Eidus wrote:

> So what we say here is: it is better to have little junk in the unstable
> tree that get flushed eventualy anyway, instead of make the guest
> slower....
> this race is something that does not reflect accurate of ksm anyway due
> to the full memcmp that we will eventualy perform...

With 2MB pages, I am not convinced they will get "flushed eventually",
because there is a good chance at least one of the 4kB pages inside
a 2MB page is in active use at all times.

I worry that the proposed changes may end up effectively preventing
KSM from scanning inside 2MB pages, when even one 4kB page inside
is in active use.  This could mean increased swapping on systems
that run low on memory, which can be a much larger performance penalty
than ksmd CPU use.

We need to scan inside 2MB pages when memory runs low, regardless
of the accessed or dirty bits.
Chris Wright June 22, 2011, 3:46 p.m. UTC | #17
* Izik Eidus (izik.eidus@ravellosystems.com) wrote:
> On 6/22/2011 3:21 AM, Chris Wright wrote:
> >* Nai Xia (nai.xia@gmail.com) wrote:
> >>+	if (!shadow_dirty_mask) {
> >>+		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
> >>+		goto out;
> >>+	}
> >This should never fire with the dirty_update() notifier test, right?
> >And that means that this whole optimization is for the shadow mmu case,
> >arguably the legacy case.
> 
> Hi Chris,
> AMD npt does track the dirty bit in the nested page tables,
> so the shadow_dirty_mask should not be 0 in that case...

Yeah, momentary lapse... ;)
Andrea Arcangeli June 22, 2011, 4:55 p.m. UTC | #18
On Wed, Jun 22, 2011 at 11:39:40AM -0400, Rik van Riel wrote:
> On 06/22/2011 07:19 AM, Izik Eidus wrote:
> 
> > So what we say here is: it is better to have little junk in the unstable
> > tree that get flushed eventualy anyway, instead of make the guest
> > slower....
> > this race is something that does not reflect accurate of ksm anyway due
> > to the full memcmp that we will eventualy perform...
> 
> With 2MB pages, I am not convinced they will get "flushed eventually",
> because there is a good chance at least one of the 4kB pages inside
> a 2MB page is in active use at all times.
> 
> I worry that the proposed changes may end up effectively preventing
> KSM from scanning inside 2MB pages, when even one 4kB page inside
> is in active use.  This could mean increased swapping on systems
> that run low on memory, which can be a much larger performance penalty
> than ksmd CPU use.
> 
> We need to scan inside 2MB pages when memory runs low, regardless
> of the accessed or dirty bits.

I guess we could fallback to the cksum when a THP is encountered
(repeating the test_and_clear_dirty also wouldn't give the expected
result if it's repeated on the same hugepmd for the next 4k virtual
address candidate for unstable tree insertion, so it'd need special
handling during the virtual walk anyway).

So it's getting a little hairy, skip on THP, skip on EPT, then I
wonder what is the common case that would be left using it...

Or we could evaluate with statistics how many fewer pages are inserted
into the unstable tree using the 2m dirty bit, but clearly it'd be less
reliable; the algorithm really is meant to track the volatility of
what is later merged, not of a bigger chunk with unrelated data in it.

On a side note, khugepaged should also be changed to preserve the
dirty bit if at least one of the ptes is dirty (currently the hugepmd is
always created dirty; a hugepmd can never be clean today, so the dirty bit
wasn't preserved in khugepaged so far).
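
A sketch of what preserving it could look like in khugepaged's collapse path
(illustrative only; the idea is to remember whether any of the collapsed ptes
was dirty while they are copied, and only mark the new hugepmd dirty in that
case):

	/* in the pte copy loop: */
	if (pte_dirty(pteval))
		any_dirty = 1;

	/* when building the new hugepmd: */
	_pmd = mk_pmd(new_page, vma->vm_page_prot);
	_pmd = maybe_pmd_mkwrite(_pmd, vma);
	if (any_dirty)
		_pmd = pmd_mkdirty(_pmd);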
Nai Xia June 22, 2011, 11:13 p.m. UTC | #19
On Wed, Jun 22, 2011 at 11:39 PM, Rik van Riel <riel@redhat.com> wrote:
> On 06/22/2011 07:19 AM, Izik Eidus wrote:
>
>> So what we say here is: it is better to have little junk in the unstable
>> tree that get flushed eventualy anyway, instead of make the guest
>> slower....
>> this race is something that does not reflect accurate of ksm anyway due
>> to the full memcmp that we will eventualy perform...
>
> With 2MB pages, I am not convinced they will get "flushed eventually",
> because there is a good chance at least one of the 4kB pages inside
> a 2MB page is in active use at all times.
>
> I worry that the proposed changes may end up effectively preventing
> KSM from scanning inside 2MB pages, when even one 4kB page inside
> is in active use.  This could mean increased swapping on systems
> that run low on memory, which can be a much larger performance penalty
> than ksmd CPU use.
>
> We need to scan inside 2MB pages when memory runs low, regardless
> of the accessed or dirty bits.

I agree on this point. The dirty bit, like the young bit, is by no means
accurate. Even for 4kB pages there is always a chance that the pte is dirty
but the contents are actually the same. Yes, the whole optimization contains
trade-offs, and trade-offs always have the potential to annoy someone, just
like the page-bit-based LRU approximations, none of which is perfect either.
But I think it can benefit some people. So maybe we could provide a generic,
balanced solution but also provide fine-tuning interfaces, to make sure that
when it really gets in someone's way, there is a way to work around it.
Do you agree with my argument? :-)

>
> --
> All rights reversed
>
Nai Xia June 22, 2011, 11:19 p.m. UTC | #20
On Wed, Jun 22, 2011 at 11:03 PM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Tue, Jun 21, 2011 at 09:32:39PM +0800, Nai Xia wrote:
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index d48ec60..b407a69 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
>>               kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
>>                               VMX_EPT_EXECUTABLE_MASK);
>>               kvm_enable_tdp();
>> +             kvm_dirty_update = 0;
>>       } else
>>               kvm_disable_tdp();
>>
>
> Why not return !shadow_dirty_mask instead of adding a new var?
>
>>  struct mmu_notifier_ops {
>> +     int (*dirty_update)(struct mmu_notifier *mn,
>> +                          struct mm_struct *mm);
>> +
>
> Needs some docu.
>
> I think dirty_update isn't self explanatory name. I think
> "has_test_and_clear_dirty" would be better.
>
> If we don't flush the smp tlb don't we risk that we'll insert pages in
> the unstable tree that are volatile just because the dirty bit didn't
> get set again on the spte?
>
> The first patch I guess it's a sign of hugetlbfs going a little over
> the edge in trying to mix with the core VM... Passing that parameter
> &need_pte_unmap all over the place not so nice, maybe it'd be possible
> to fix within hugetlbfs to use a different method to walk the hugetlb
> vmas. I'd prefer that if possible.

OK, I'll try some other workarounds.
I am not too happy with need_pte_unmap myself. :-)

Thanks for viewing!

-Nai

>
> Thanks,
> Andrea
>
Andrea Arcangeli June 22, 2011, 11:25 p.m. UTC | #21
On Thu, Jun 23, 2011 at 07:13:54AM +0800, Nai Xia wrote:
> I agree on this point. Dirty bit , young bit, is by no means accurate. Even
> on 4kB pages, there is always a chance that the pte are dirty but the contents
> are actually the same. Yeah, the whole optimization contains trade-offs and

Just a side note: the fact that the dirty bit would be set even when the
data is the same is actually a pro, not a con. If the content is the
same but the page was written to, it would trigger a copy-on-write shortly
after merging the page, rendering the whole exercise wasteful. The
cksum plays a double role: it "stabilizes" the unstable tree, so
there's less chance of bad lookups, and it also keeps us from merging
stuff that is written to frequently, triggering copy-on-writes; the
dirty bit would additionally catch overwrites with the same data, something
the cksum can't do.
Rik van Riel June 22, 2011, 11:28 p.m. UTC | #22
On 06/22/2011 07:13 PM, Nai Xia wrote:
> On Wed, Jun 22, 2011 at 11:39 PM, Rik van Riel <riel@redhat.com> wrote:
>> On 06/22/2011 07:19 AM, Izik Eidus wrote:
>>
>>> So what we say here is: it is better to have little junk in the unstable
>>> tree that get flushed eventualy anyway, instead of make the guest
>>> slower....
>>> this race is something that does not reflect accurate of ksm anyway due
>>> to the full memcmp that we will eventualy perform...
>>
>> With 2MB pages, I am not convinced they will get "flushed eventually",
>> because there is a good chance at least one of the 4kB pages inside
>> a 2MB page is in active use at all times.
>>
>> I worry that the proposed changes may end up effectively preventing
>> KSM from scanning inside 2MB pages, when even one 4kB page inside
>> is in active use.  This could mean increased swapping on systems
>> that run low on memory, which can be a much larger performance penalty
>> than ksmd CPU use.
>>
>> We need to scan inside 2MB pages when memory runs low, regardless
>> of the accessed or dirty bits.
>
> I agree on this point. Dirty bit , young bit, is by no means accurate. Even
> on 4kB pages, there is always a chance that the pte are dirty but the contents
> are actually the same. Yeah, the whole optimization contains trade-offs and
> trades-offs always have the possibilities to annoy  someone.  Just like
> page-bit-relying LRU approximations none of them is perfect too. But I think
> it can benefit some people. So maybe we could just provide a generic balanced
> solution but provide fine tuning interfaces to make sure tha when it really gets
> in the way of someone, he has a way to walk around.
> Do you agree on my argument? :-)

That's not an argument.

That is a "if I wave my hands vigorously enough, maybe people
will let my patch in without thinking about what I wrote"
style argument.

I believe your optimization makes sense for 4kB pages, but
is going to be counter-productive for 2MB pages.

Your approach of "make ksmd skip over more pages, so it uses
less CPU" is likely to reduce the effectiveness of ksm by not
sharing some pages.

For 4kB pages that is fine, because you'll get around to them
eventually.

However, the internal use of a 2MB page is likely to be quite
different.  Chances are most 2MB pages will have actively used,
barely used and free pages inside.

You absolutely want ksm to get at the barely used and free
sub-pages.  Having just one actively used 4kB sub-page prevent
ksm from merging any of the other 511 sub-pages is a problem.
Nai Xia June 22, 2011, 11:37 p.m. UTC | #23
On Thu, Jun 23, 2011 at 12:55 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Wed, Jun 22, 2011 at 11:39:40AM -0400, Rik van Riel wrote:
>> On 06/22/2011 07:19 AM, Izik Eidus wrote:
>>
>> > So what we say here is: it is better to have little junk in the unstable
>> > tree that get flushed eventualy anyway, instead of make the guest
>> > slower....
>> > this race is something that does not reflect accurate of ksm anyway due
>> > to the full memcmp that we will eventualy perform...
>>
>> With 2MB pages, I am not convinced they will get "flushed eventually",
>> because there is a good chance at least one of the 4kB pages inside
>> a 2MB page is in active use at all times.
>>
>> I worry that the proposed changes may end up effectively preventing
>> KSM from scanning inside 2MB pages, when even one 4kB page inside
>> is in active use.  This could mean increased swapping on systems
>> that run low on memory, which can be a much larger performance penalty
>> than ksmd CPU use.
>>
>> We need to scan inside 2MB pages when memory runs low, regardless
>> of the accessed or dirty bits.
>
> I guess we could fallback to the cksum when a THP is encountered
> (repeating the test_and_clear_dirty also wouldn't give the expected
> result if it's repeated on the same hugepmd for the next 4k virtual
> address candidate for unstable tree insertion, so it'd need special
> handling during the virtual walk anyway).
>
> So it's getting a little hairy, skip on THP, skip on EPT, then I
> wonder what is the common case that would be left using it...
>
> Or we could evaluate with statistic how many less pages are inserted
> into the unstable tree using the 2m dirty bit but clearly it'd be less
> reliable, the algorithm really is meant to track the volatility of
> what is later merged, not of a bigger chunk with unrelated data in it.

On 2MB pages, I'd like to remind you and Rik that ksmd currently splits
huge pages before their sub-pages actually get merged into the stable tree.
So when there are many 2MB pages that each have a 4kB sub-page changing all
the time, it is already hard for ksmd to judge whether it's worthwhile to
split the 2MB page and get its sub-pages merged.
I think the policy for ksmd in a system should be "if you cannot do something
good, at least do nothing evil". So I really don't think we can satisfy _all_
people. Provide a general method and give users one or two knobs to tune it
for the corner cases. What do you think of my proposal?

>
> On a side note, khugepaged should also be changed to preserve the
> dirty bit if at least one dirty bit of the ptes is dirty (currently
> the hugepmd is always created dirty, it can never happen for an
> hugepmd to be clean today so it wasn't preserved in khugepaged so far).
>

Thanks for pointing that out. This is something I had overlooked!
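
For reference, preserving it would mean something like the helper below in the
collapse path (just a sketch of what you describe; the function name is
illustrative, not existing code):

static int any_small_pte_dirty(pte_t *pte)
{
	int i;

	/* the new hugepmd should be dirty only if at least one small pte was */
	for (i = 0; i < HPAGE_PMD_NR; i++)
		if (pte_present(pte[i]) && pte_dirty(pte[i]))
			return 1;
	return 0;
}

khugepaged would then apply pmd_mkdirty() to the new hugepmd only when this
returns 1, instead of creating it dirty unconditionally.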

thanks,
Nai
Nai Xia June 22, 2011, 11:42 p.m. UTC | #24
On Wed, Jun 22, 2011 at 11:03 PM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Tue, Jun 21, 2011 at 09:32:39PM +0800, Nai Xia wrote:
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index d48ec60..b407a69 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
>>               kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
>>                               VMX_EPT_EXECUTABLE_MASK);
>>               kvm_enable_tdp();
>> +             kvm_dirty_update = 0;
>>       } else
>>               kvm_disable_tdp();
>>
>
> Why not return !shadow_dirty_mask instead of adding a new var?
>
>>  struct mmu_notifier_ops {
>> +     int (*dirty_update)(struct mmu_notifier *mn,
>> +                          struct mm_struct *mm);
>> +
>
> Needs some docu.

OK. I'll add it.

>
> I think dirty_update isn't a self-explanatory name. I think
> "has_test_and_clear_dirty" would be better.

Agreed.  So it will be the name in the next version. :)
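
Together with your earlier suggestion to return !shadow_dirty_mask instead of
adding a new variable, the KVM callback could then shrink to something like
this (only a sketch; shadow_dirty_mask is local to arch/x86/kvm/mmu.c, so it
would need a small arch helper to be reachable from virt/kvm/kvm_main.c):

static int kvm_mmu_notifier_has_test_and_clear_dirty(struct mmu_notifier *mn,
						     struct mm_struct *mm)
{
	/*
	 * With EPT the sptes carry no hardware-updated dirty bit
	 * (shadow_dirty_mask is 0), so report that test-and-clear
	 * is not usable in that case.
	 */
	return !!shadow_dirty_mask;
}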

Thanks,
Nai

>
> If we don't flush the smp tlb don't we risk that we'll insert pages in
> the unstable tree that are volatile just because the dirty bit didn't
> get set again on the spte?
>
> The first patch, I guess, is a sign of hugetlbfs going a little over
> the edge in trying to mix with the core VM... Passing that
> &need_pte_unmap parameter all over the place is not so nice; maybe it'd be
> possible to fix it within hugetlbfs, using a different method to walk the
> hugetlb vmas. I'd prefer that if possible.
>
> Thanks,
> Andrea
>
Andrea Arcangeli June 22, 2011, 11:44 p.m. UTC | #25
On Thu, Jun 23, 2011 at 07:19:06AM +0800, Nai Xia wrote:
> OK, I'll have a try over other workarounds.
> I am not feeling good about need_pte_unmap myself. :-)

The usual way is to check VM_HUGETLB in the caller and to call another
function that doesn't kmap. Casting pmd_t to pte_t isn't really nice
(but hey we're also doing that exceptionally in smaps_pte_range for
THP, but it's safe there because we're casting the value of the pmd, not
the pointer to the pmd, so the kmap is done by the pte version of the
caller and not done by the pmd version of the caller).
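
In rough terms the structure would be something like the below (only a sketch;
the hugetlb helper name is made up, and the pte side leaves out the actual
clearing and flushing details):

static int page_test_dirty_one(struct page *page, struct vm_area_struct *vma,
			       unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pte_t *ptep;
	int dirty = 0;

	/* hugetlb vmas: look at the huge pte/pmd directly, nothing to kmap */
	if (is_vm_hugetlb_page(vma))
		return hugetlb_test_and_clear_dirty(mm, vma, address);

	ptep = page_check_address(page, mm, address, &ptl, 0);
	if (!ptep)
		return 0;
	if (pte_dirty(*ptep))
		dirty = 1;	/* clearing and TLB flushing omitted here */
	pte_unmap_unlock(ptep, ptl);
	return dirty;
}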

Is it done for migrate? Surely it's not for swapout ;).

> Thanks for viewing!

You're welcome!

JFYI I'll be offline on vacation for a week, starting tomorrow, so if
I don't answer in the next few days that's the reason but I'll follow
the progress in a week.

Thanks!
Andrea
Andrea Arcangeli June 22, 2011, 11:59 p.m. UTC | #26
On Thu, Jun 23, 2011 at 07:37:47AM +0800, Nai Xia wrote:
> On 2MB pages, I'd like to remind you and Rik that ksmd currently splits
> huge pages before their sub pages gets really merged to stable tree.
> So when there are many 2MB pages each having a 4kB subpage
> changed for all time, this is already a concern for ksmd to judge
> if it's worthwhile to split 2MB page and get its sub-pages merged.

Hmm, not sure I follow. KSM memory density with THP on and off should
be identical. The cksum is computed on subpages, so the fact that the 4k
subpage is actually mapped by a hugepmd is invisible to KSM up to the
point where an unstable_tree_search_insert/stable_tree_search lookup
succeeds.

> I think the policy for ksmd in a system should be "If you cannot do sth good,
> at least do nothing evil". So I really don't think we can satisfy _all_ people.
> Get a general method and give users one or two knobs to tune it when they
> are the corner cases. How do  you think of my proposal ?

I'm neutral, but if we get two methods for deciding the unstable tree
candidates, the default probably should prioritize on maximum merging
even if it takes more CPU (if one cares about performance of the core
dedicated to ksmd, KSM is likely going to be off or scanning at low
rate in the first place).

> > On a side note, khugepaged should also be changed to preserve the
> > dirty bit if at least one dirty bit of the ptes is dirty (currently
> > the hugepmd is always created dirty, it can never happen for an
> > hugepmd to be clean today so it wasn't preserved in khugepaged so far).
> >
> 
> Thanks for the point that out. This is what I have overlooked!

No prob. And its default scan rate is very slow compared to ksmd so
it was unlikely to generate too many false positive dirty bits even if
you were splitting hugepages through swap.
Rik van Riel June 23, 2011, midnight UTC | #27
On 06/22/2011 07:37 PM, Nai Xia wrote:

> On 2MB pages, I'd like to remind you and Rik that ksmd currently splits
> huge pages before their sub pages gets really merged to stable tree.

Your proposal appears to add a condition that causes ksmd to skip
doing that, which can cause the system to start using swap instead
of sharing memory.

> So when there are many 2MB pages each having a 4kB subpage
> changed for all time, this is already a concern for ksmd to judge
> if it's worthwhile to split 2MB page and get its sub-pages merged.
> I think the policy for ksmd in a system should be "If you cannot do sth good,
> at least do nothing evil". So I really don't think we can satisfy _all_ people.
> Get a general method and give users one or two knobs to tune it when they
> are the corner cases. How do  you think of my proposal ?

I think your proposal makes sense for 4kB pages, but the ksmd
policy for 2MB pages probably needs to be much more aggressive.
Nai Xia June 23, 2011, 12:14 a.m. UTC | #28
On Thu, Jun 23, 2011 at 7:44 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Thu, Jun 23, 2011 at 07:19:06AM +0800, Nai Xia wrote:
>> OK, I'll have a try over other workarounds.
>> I am not feeling good about need_pte_unmap myself. :-)
>
> The usual way is to check VM_HUGETLB in the caller and to call another
> function that doesn't kmap. Casting pmd_t to pte_t isn't really nice
> (but hey we're also doing that exceptionally in smaps_pte_range for
> THP, but it safe there because we're casting the value of the pmd, not
> the pointer to the pmd, so the kmap is done by the pte version of the
> caller and not done by the pmd version of the caller).
>
> Is it done for migrate? Surely it's not for swapout ;).

Thanks for the hint. :-)

You know, another thing I am worried about is that I think I
made page_check_address() return a pmd version for skipping the
tail subpages ...
I did detect a "scheduling while atomic" warning when I kunmap() the returned value. :-(

>
>> Thanks for viewing!
>
> You're welcome!
>
> JFYI I'll be offline on vacation for a week, starting tomorrow, so if
> I don't answer in the next few days that's the reason but I'll follow
> the progress in a week.

Have a nice vacation, man! Enjoy the sunlight; we all get enough of
staring at code indoors. ;-)


Thanks,
Nai

>
> Thanks!
> Andrea
>
Nai Xia June 23, 2011, 12:31 a.m. UTC | #29
On Thu, Jun 23, 2011 at 7:59 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Thu, Jun 23, 2011 at 07:37:47AM +0800, Nai Xia wrote:
>> On 2MB pages, I'd like to remind you and Rik that ksmd currently splits
>> huge pages before their sub pages gets really merged to stable tree.
>> So when there are many 2MB pages each having a 4kB subpage
>> changed for all time, this is already a concern for ksmd to judge
>> if it's worthwhile to split 2MB page and get its sub-pages merged.
>
> Hmm not sure to follow. KSM memory density with THP on and off should
> be identical. The cksum is computed on subpages so the fact the 4k
> subpage is actually mapped by a hugepmd is invisible to KSM up to the
> point we get a unstable_tree_search_insert/stable_tree_search lookup
> succeeding.

I agree on your points.

But, I mean splitting the huge page into normal pages when some subpages
need to be merged may increase the CPU's TLB lookup overhead and
_might_ hurt the workload ksmd is scanning. If only a small portion of the
false-negative 2MB pages really get merged eventually, maybe it's not
worthwhile, right?

But, well, just like Rik said below, yes, ksmd should be more aggressive to
avoid the much more costly swapping.

>
>> I think the policy for ksmd in a system should be "If you cannot do sth good,
>> at least do nothing evil". So I really don't think we can satisfy _all_ people.
>> Get a general method and give users one or two knobs to tune it when they
>> are the corner cases. How do  you think of my proposal ?
>
> I'm neutral, but if we get two methods for deciding the unstable tree
> candidates, the default probably should prioritize on maximum merging
> even if it takes more CPU (if one cares about performance of the core
> dedicated to ksmd, KSM is likely going to be off or scanning at low
> rate in the first place).

I agree with you here.


thanks,

Nai
>
>> > On a side note, khugepaged should also be changed to preserve the
>> > dirty bit if at least one dirty bit of the ptes is dirty (currently
>> > the hugepmd is always created dirty, it can never happen for an
>> > hugepmd to be clean today so it wasn't preserved in khugepaged so far).
>> >
>>
>> Thanks for the point that out. This is what I have overlooked!
>
> No prob. And its default scan rate is very slow compared to ksmd so
> it was unlikely to generate too many false positive dirty bits even if
> you were splitting hugepages through swap.
>
Nai Xia June 23, 2011, 12:42 a.m. UTC | #30
On Thu, Jun 23, 2011 at 8:00 AM, Rik van Riel <riel@redhat.com> wrote:
> On 06/22/2011 07:37 PM, Nai Xia wrote:
>
>> On 2MB pages, I'd like to remind you and Rik that ksmd currently splits
>> huge pages before their sub pages gets really merged to stable tree.
>
> Your proposal appears to add a condition that causes ksmd to skip
> doing that, which can cause the system to start using swap instead
> of sharing memory.

Hmm, yes, we don't want extra swapping. So we should keep the checksum as
the default for huge pages, right?

>
>> So when there are many 2MB pages each having a 4kB subpage
>> changed for all time, this is already a concern for ksmd to judge
>> if it's worthwhile to split 2MB page and get its sub-pages merged.
>> I think the policy for ksmd in a system should be "If you cannot do sth
>> good,
>> at least do nothing evil". So I really don't think we can satisfy _all_
>> people.
>> Get a general method and give users one or two knobs to tune it when they
>> are the corner cases. How do  you think of my proposal ?
>
> I think your proposal makes sense for 4kB pages, but the ksmd
> policy for 2MB pages probably needs to be much more aggressive.

I now agree with you on the whole point. Let's fall back to the checksum.
Thanks for making my mind clear! :)
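
Roughly, the volatility check before unstable tree insertion would then look
like this (a sketch only; page_test_and_clear_dirty_bits() stands in for
whatever the per-pte dirty-bit helper ends up being called):

static int page_still_volatile(struct rmap_item *rmap_item, struct page *page,
			       struct mm_struct *mm)
{
	u32 checksum;

	if (PageTransCompound(page)) {
		/* one hugepmd dirty bit covers 512 subpages: keep the cksum */
		checksum = calc_checksum(page);
		if (rmap_item->oldchecksum != checksum) {
			rmap_item->oldchecksum = checksum;
			return 1;
		}
		return 0;
	}
	/* plain 4kB mapping: trust the pte/spte dirty bit */
	return page_test_and_clear_dirty_bits(mm, rmap_item->address, page);
}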

And shall we provide an interface for users who _really_ want to judge the
dirty bit at the pmd level? I think we should agree on this point before I
misunderstand you and spam you with my next submission :P


And thanks for taking the time to review!

-Nai


>
> --
> All rights reversed
>
Andrea Arcangeli June 23, 2011, 12:44 a.m. UTC | #31
On Thu, Jun 23, 2011 at 08:31:56AM +0800, Nai Xia wrote:
> On Thu, Jun 23, 2011 at 7:59 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > On Thu, Jun 23, 2011 at 07:37:47AM +0800, Nai Xia wrote:
> >> On 2MB pages, I'd like to remind you and Rik that ksmd currently splits
> >> huge pages before their sub pages gets really merged to stable tree.
> >> So when there are many 2MB pages each having a 4kB subpage
> >> changed for all time, this is already a concern for ksmd to judge
> >> if it's worthwhile to split 2MB page and get its sub-pages merged.
> >
> > Hmm not sure to follow. KSM memory density with THP on and off should
> > be identical. The cksum is computed on subpages so the fact the 4k
> > subpage is actually mapped by a hugepmd is invisible to KSM up to the
> > point we get a unstable_tree_search_insert/stable_tree_search lookup
> > succeeding.
> 
> I agree on your points.
> 
> But, I mean splitting the huge page into normal pages when some subpages
> need to be merged may increase the TLB lookside timing of CPU and
> _might_ hurt the workload ksmd is scanning. If only a small portion of false
> negative 2MB pages are really get merged eventually, maybe it's not worthwhile,
> right?

Yes, there's no threshold to say "only split if we could merge more
than N subpages"; 1 subpage match in two different hugepages is enough
to split both and save just 4k, but then memory accesses will be slower
for both 2m ranges that have been split. But the point is that it
won't be slower than if THP was off in the first place. So in the end
all we gain is 4k saved, but we still run faster than with THP off in the
other hugepages that haven't been split yet.

> But, well, just like Rik said below, yes, ksmd should be more aggressive to
> avoid much more time consuming cost for swapping.

Correct, the above logic also follows the idea of always maximizing
memory merging in KSM, which is why we have no threshold waiting for N
subpages to be mergeable before we split the hugepage.

I'm unsure whether admins in real life would actually use those
thresholds even if we implemented them. The current way of enabling
KSM on a per-VM (not per-host) basis is pretty simple: the
performance-critical VM has KSM off, the non-performance-critical VM has
KSM on, and it prioritizes memory merging.
Nai Xia June 23, 2011, 12:52 a.m. UTC | #32
On Thu, Jun 23, 2011 at 7:28 AM, Rik van Riel <riel@redhat.com> wrote:
> On 06/22/2011 07:13 PM, Nai Xia wrote:
>>
>> On Wed, Jun 22, 2011 at 11:39 PM, Rik van Riel<riel@redhat.com>  wrote:
>>>
>>> On 06/22/2011 07:19 AM, Izik Eidus wrote:
>>>
>>>> So what we say here is: it is better to have a little junk in the unstable
>>>> tree that gets flushed eventually anyway, instead of making the guest
>>>> slower....
>>>> this race is something that does not reflect the accuracy of ksm anyway, due
>>>> to the full memcmp that we will eventually perform...
>>>
>>> With 2MB pages, I am not convinced they will get "flushed eventually",
>>> because there is a good chance at least one of the 4kB pages inside
>>> a 2MB page is in active use at all times.
>>>
>>> I worry that the proposed changes may end up effectively preventing
>>> KSM from scanning inside 2MB pages, when even one 4kB page inside
>>> is in active use.  This could mean increased swapping on systems
>>> that run low on memory, which can be a much larger performance penalty
>>> than ksmd CPU use.
>>>
>>> We need to scan inside 2MB pages when memory runs low, regardless
>>> of the accessed or dirty bits.
>>
>> I agree on this point. Dirty bit, young bit, is by no means accurate. Even
>> on 4kB pages, there is always a chance that the ptes are dirty but the
>> contents are actually the same. Yeah, the whole optimization involves
>> trade-offs, and trade-offs always have the potential to annoy someone.
>> Just like the page-bit-based LRU approximations, none of them is perfect
>> either. But I think it can benefit some people. So maybe we could just
>> provide a generic, balanced solution plus fine-tuning interfaces, so that
>> when it really gets in the way of someone, he has a way to work around it.
>> Do you agree with my argument? :-)
>
> That's not an argument.
>
> That is a "if I wave my hands vigorously enough, maybe people
> will let my patch in without thinking about what I wrote"
> style argument.

Oh, NO, this is not what I meant.
Really sorry if I made myself look so evil...
I actually meant: "Skip or not, we agree on a point that will not
harm most people, and provide another interface to let someone
who _really_ wants to take another way."

I am by no means pushing the idea of "skipping" huge pages.
I am just not sure about it and want to get a precise idea from
you. And now I get it.


>
> I believe your optimization makes sense for 4kB pages, but
> is going to be counter-productive for 2MB pages.
>
> Your approach of "make ksmd skip over more pages, so it uses
> less CPU" is likely to reduce the effectiveness of ksm by not
> sharing some pages.
>
> For 4kB pages that is fine, because you'll get around to them
> eventually.
>
> However, the internal use of a 2MB page is likely to be quite
> different.  Chances are most 2MB pages will have actively used,
> barely used and free pages inside.
>
> You absolutely want ksm to get at the barely used and free
> sub-pages.  Having just one actively used 4kB sub-page prevent
> ksm from merging any of the other 511 sub-pages is a problem.

No, no,  I was just not sure about it. I meant we cannot satisfy
all people but I was not sure which one is good for most of them.

Sorry, again, if I didn't make it clear.


Nai

>
> --
> All rights reversed
>
Nai Xia June 23, 2011, 1:30 a.m. UTC | #33
On Thu, Jun 23, 2011 at 7:25 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Thu, Jun 23, 2011 at 07:13:54AM +0800, Nai Xia wrote:
>> I agree on this point. Dirty bit, young bit, is by no means accurate. Even
>> on 4kB pages, there is always a chance that the ptes are dirty but the contents
>> are actually the same. Yeah, the whole optimization involves trade-offs and
>
> Just a side note: the fact that the dirty bit would be set even when the
> data is the same is actually a pro, not a con. If the content is the
> same but the page was written to, it'd trigger a copy-on-write shortly
> after merging the page, rendering the whole exercise wasteful. The
> cksum plays a double role: it "stabilizes" the unstable tree, so
> there's less chance of bad lookups, but it also avoids merging
> stuff that is written to frequently, triggering copy-on-writes, and the
> dirty bit would also catch overwrites with the same data, something
> the cksum can't do.

Good point. I actually have another version of ksm myself (off topic, but
if you want to take a glance: http://code.google.com/p/uksm/ :-) )
that collects statistics on the ratio of pages in a VMA that really got COWed
due to KSM merging, on a per-scan-round basis.

It's complicated to deduce precise information from the dirty bit
and the cksum alone.


Thanks,
Nai
>
Nai Xia June 23, 2011, 1:36 a.m. UTC | #34
On Thu, Jun 23, 2011 at 8:44 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Thu, Jun 23, 2011 at 08:31:56AM +0800, Nai Xia wrote:
>> On Thu, Jun 23, 2011 at 7:59 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
>> > On Thu, Jun 23, 2011 at 07:37:47AM +0800, Nai Xia wrote:
>> >> On 2MB pages, I'd like to remind you and Rik that ksmd currently splits
>> >> huge pages before their sub pages gets really merged to stable tree.
>> >> So when there are many 2MB pages each having a 4kB subpage
>> >> changed for all time, this is already a concern for ksmd to judge
>> >> if it's worthwhile to split 2MB page and get its sub-pages merged.
>> >
>> > Hmm not sure to follow. KSM memory density with THP on and off should
>> > be identical. The cksum is computed on subpages so the fact the 4k
>> > subpage is actually mapped by a hugepmd is invisible to KSM up to the
>> > point we get a unstable_tree_search_insert/stable_tree_search lookup
>> > succeeding.
>>
>> I agree on your points.
>>
>> But, I mean splitting the huge page into normal pages when some subpages
>> need to be merged may increase the TLB lookside timing of CPU and
>> _might_ hurt the workload ksmd is scanning. If only a small portion of false
>> negative 2MB pages are really get merged eventually, maybe it's not worthwhile,
>> right?
>
> Yes, there's not threshold to say "only split if we could merge more
> than N subpages", 1 subpage match in two different hugepages is enough
> to split both and save just 4k but then memory accesses will be slower
> for both 2m ranges that have been splitted. But the point is that it
> won't be slower than if THP was off in the first place. So in the end
> all we gain is 4k saved but we still run faster than THP off, in the
> other hugepages that haven't been splitted yet.

Yes, so ksmd is still doing good compared to THP off.
Thanks for making my mind clearer :)

>
>> But, well, just like Rik said below, yes, ksmd should be more aggressive to
>> avoid much more time consuming cost for swapping.
>
> Correct the above logic also follows the idea to always maximize
> memory merging in KSM, which is why we've no threshold to wait N
> subpages to be mergeable before we split the hugepage.
>
> I'm unsure if admins in real life would then start to use those
> thresholds even if we'd implement them. The current way of enabling
> KSM a per-VM (not per-host) basis is pretty simple: the performance
> critical VM has KSM off, non-performance critical VM has KSM on and it
> prioritizes on memory merging.
>
Hmm, yes, you are right.

Thanks,
Nai

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2..f0d7aa0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -848,6 +848,7 @@  extern bool kvm_rebooting;
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee3862..a5a0c51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -979,6 +979,37 @@  out:
 	return young;
 }
 
+/*
+ * Caller is supposed to SetPageDirty(), it's not done inside this.
+ */
+static
+int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
+				   unsigned long data)
+{
+	u64 *spte;
+	int dirty = 0;
+
+	if (!shadow_dirty_mask) {
+		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
+		goto out;
+	}
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _dirty;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_dirty = _spte & PT_DIRTY_MASK;
+		if (_dirty) {
+			dirty = 1;
+			clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+out:
+	return dirty;
+}
+
 #define RMAP_RECYCLE_THRESHOLD 1000
 
 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1004,6 +1035,11 @@  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
 }
 
+int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, 0, kvm_test_and_clear_dirty_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca8..b8d01c3 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -18,7 +18,8 @@ 
 #define PT_PCD_MASK (1ULL << 4)
 #define PT_ACCESSED_SHIFT 5
 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
 #define PT_PAGE_SIZE_MASK (1ULL << 7)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60..b407a69 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4674,6 +4674,7 @@  static int __init vmx_init(void)
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
 		kvm_enable_tdp();
+		kvm_dirty_update = 0;
 	} else
 		kvm_disable_tdp();
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59..2036bae 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -53,7 +53,7 @@ 
 struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
-
+extern int kvm_dirty_update;
 /*
  * It would be nice to use something smarter than a linear search, TBD...
  * Thankfully we dont expect many devices to register (famous last words :),
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1d1b1e1..bd6ba2d 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -24,6 +24,9 @@  struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_ops {
+	int (*dirty_update)(struct mmu_notifier *mn,
+			     struct mm_struct *mm);
+
 	/*
 	 * Called either by mmu_notifier_unregister or when the mm is
 	 * being destroyed by exit_mmap, always before all pages are
@@ -72,6 +75,16 @@  struct mmu_notifier_ops {
 			  unsigned long address);
 
 	/*
+	 * clear_flush_dirty is called after the VM is
+	 * test-and-clearing the dirty/modified bitflag in the
+	 * pte. This way the VM will provide proper volatile page
+	 * testing to ksm.
+	 */
+	int (*test_and_clear_dirty)(struct mmu_notifier *mn,
+				    struct mm_struct *mm,
+				    unsigned long address);
+
+	/*
 	 * change_pte is called in cases that pte mapping to page is changed:
 	 * for example, when ksm remaps pte to point to a new shared page.
 	 */
@@ -170,11 +183,14 @@  extern int __mmu_notifier_register(struct mmu_notifier *mn,
 extern void mmu_notifier_unregister(struct mmu_notifier *mn,
 				    struct mm_struct *mm);
 extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
+extern int __mmu_notifier_dirty_update(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
+extern int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+					       unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -184,6 +200,19 @@  extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 
+/*
+ * For ksm to make use of the dirty bit, it wants to make sure that the dirty
+ * bits in sptes really carry the dirty information. Currently only Intel EPT
+ * does not update them in hardware, so it is excluded from ksm dirty bit tracking.
+ */
+static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_dirty_update(mm);
+
+	return 1;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
 	if (mm_has_notifiers(mm))
@@ -206,6 +235,14 @@  static inline int mmu_notifier_test_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+						    unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_test_and_clear_dirty(mm, address);
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
@@ -323,6 +360,11 @@  static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #else /* CONFIG_MMU_NOTIFIER */
 
+static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+	return 1;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
 }
@@ -339,6 +381,12 @@  static inline int mmu_notifier_test_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+						    unsigned long address)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de..a4a1467 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -18,6 +18,22 @@ 
 #include <linux/sched.h>
 #include <linux/slab.h>
 
+int __mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int dirty_update = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->dirty_update)
+			dirty_update |= mn->ops->dirty_update(mn, mm);
+	}
+	rcu_read_unlock();
+
+	return dirty_update;
+}
+
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -120,6 +136,23 @@  int __mmu_notifier_test_young(struct mm_struct *mm,
 	return young;
 }
 
+int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+					unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int dirty = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->test_and_clear_dirty)
+			dirty |= mn->ops->test_and_clear_dirty(mn, mm, address);
+	}
+	rcu_read_unlock();
+
+	return dirty;
+}
+
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 			       pte_t pte)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 96ebc06..22967c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -78,6 +78,8 @@  static atomic_t hardware_enable_failed;
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 
+int kvm_dirty_update = 1;
+
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
 struct dentry *kvm_debugfs_dir;
@@ -398,6 +400,23 @@  static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 	return young;
 }
 
+/* Caller should SetPageDirty(), no need to flush tlb */
+static int kvm_mmu_notifier_test_and_clear_dirty(struct mmu_notifier *mn,
+						 struct mm_struct *mm,
+						 unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int dirty, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	dirty = kvm_test_and_clear_dirty_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return dirty;
+}
+
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
@@ -409,14 +428,22 @@  static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
+static int kvm_mmu_notifier_dirty_update(struct mmu_notifier *mn,
+					 struct mm_struct *mm)
+{
+	return kvm_dirty_update;
+}
+
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
 	.test_young		= kvm_mmu_notifier_test_young,
+	.test_and_clear_dirty	= kvm_mmu_notifier_test_and_clear_dirty,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
+	.dirty_update		= kvm_mmu_notifier_dirty_update,
 };
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)