
[4/4] KVM: MMU: fast cleanup D bit based on fast write protect

Message ID 1547733331-16140-5-git-send-email-ann.zhuangyanying@huawei.com (mailing list archive)
State New, archived
Series KVM: MMU: fast cleanup D bit based on fast write protect

Commit Message

Zhuang Yanying Jan. 17, 2019, 1:55 p.m. UTC
From: Zhuang Yanying <ann.zhuangyanying@huawei.com>

When live-migrating a large-memory guest, vCPUs may hang for a long
time while migration is starting, e.g. 9s for a 2T guest
(linux-5.0.0-rc2 + qemu-3.1.0).
The reason is that memory_global_dirty_log_start() takes too long while
the vCPUs wait for the BQL. The page-by-page D-bit clearing is the main
time consumer. The idea of "KVM: MMU: fast write protect" by
Xiao Guangrong, especially kvm_mmu_write_protect_all_pages(),
is very helpful. With a small modification on top of his patches, the
problem is solved: 9s down to 0.5s.

At the beginning of live migration, write protection is applied only to
the top-level SPTEs. A write from the guest then triggers an EPT
violation, and write protection is pushed down the direct map with
for_each_shadow_entry(). Finally the Dirty bit of the target page (in
the level-1 page table) is cleared and dirty page tracking starts. Of
course, the page the faulting GPA belongs to is marked dirty in
mmu_set_spte(). Xen has a similar implementation, just using EMT
instead of write protection.

Signed-off-by: Zhuang Yanying <ann.zhuangyanying@huawei.com>
---
 arch/x86/kvm/mmu.c     | 8 +++++---
 arch/x86/kvm/vmx/vmx.c | 3 +--
 2 files changed, 6 insertions(+), 5 deletions(-)
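
The mmu.c hunk boils down to a per-level decision made while walking the
shadow page table to arm dirty logging. As an illustrative sketch only (this
helper does not exist in the patch; spte_clear_dirty() and
spte_write_protect() are the existing mmu.c helpers it builds on):

static bool arm_dirty_logging_for_spte(u64 *sptep, int level)
{
	/*
	 * Last-level (4 KiB) SPTEs only need their D bit cleared so that
	 * hardware dirty logging can pick up subsequent writes.
	 */
	if (level == PT_PAGE_TABLE_LEVEL)
		return spte_clear_dirty(sptep);

	/*
	 * Larger mappings are write protected instead, so the first write
	 * faults and the large page can be split before it is tracked.
	 */
	return spte_write_protect(sptep, false);
}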

Comments

Sean Christopherson Jan. 17, 2019, 4:32 p.m. UTC | #1
On Thu, Jan 17, 2019 at 01:55:31PM +0000, Zhuangyanying wrote:
> From: Zhuang Yanying <ann.zhuangyanying@huawei.com>
> 
> When live-migrating a large-memory guest, vCPUs may hang for a long
> time while migration is starting, e.g. 9s for a 2T guest
> (linux-5.0.0-rc2 + qemu-3.1.0).
> The reason is that memory_global_dirty_log_start() takes too long while
> the vCPUs wait for the BQL. The page-by-page D-bit clearing is the main
> time consumer. The idea of "KVM: MMU: fast write protect" by
> Xiao Guangrong, especially kvm_mmu_write_protect_all_pages(),
> is very helpful. With a small modification on top of his patches, the
> problem is solved: 9s down to 0.5s.
> 
> At the beginning of live migration, write protection is applied only to
> the top-level SPTEs. A write from the guest then triggers an EPT
> violation, and write protection is pushed down the direct map with
> for_each_shadow_entry(). Finally the Dirty bit of the target page (in
> the level-1 page table) is cleared and dirty page tracking starts. Of
> course, the page the faulting GPA belongs to is marked dirty in
> mmu_set_spte(). Xen has a similar implementation, just using EMT
> instead of write protection.
> 
> Signed-off-by: Zhuang Yanying <ann.zhuangyanying@huawei.com>
> ---
>  arch/x86/kvm/mmu.c     | 8 +++++---
>  arch/x86/kvm/vmx/vmx.c | 3 +--
>  2 files changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 047b897..a18bcc0 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3257,7 +3257,10 @@ static bool mmu_load_shadow_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  			break;
>  
>  		if (is_last_spte(spte, sp->role.level)) {
> -			flush |= spte_write_protect(sptep, false);
> +			if (sp->role.level == PT_PAGE_TABLE_LEVEL)
> +				flush |= spte_clear_dirty(sptep);
> +			else
> +				flush |= spte_write_protect(sptep, false);
>  			continue;
>  		}
>  
> @@ -6114,7 +6117,6 @@ void kvm_mmu_write_protect_all_pages(struct kvm *kvm, bool write_protect)
>  {
>  	u64 wp_all_indicator, kvm_wp_all_gen;
>  
> -	mutex_lock(&kvm->slots_lock);
>  	wp_all_indicator = get_write_protect_all_indicator(kvm);
>  	kvm_wp_all_gen = get_write_protect_all_gen(wp_all_indicator);
>  
> @@ -6134,8 +6136,8 @@ void kvm_mmu_write_protect_all_pages(struct kvm *kvm, bool write_protect)
>  	 */
>  	if (write_protect)
>  		kvm_reload_remote_mmus(kvm);
> -	mutex_unlock(&kvm->slots_lock);

Why is the lock removed?  And why was it added in the first place?

>  }
> +EXPORT_SYMBOL_GPL(kvm_mmu_write_protect_all_pages);
>  
>  static unsigned long
>  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index f6915f1..5236a07 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7180,8 +7180,7 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  static void vmx_slot_enable_log_dirty(struct kvm *kvm,
>  				     struct kvm_memory_slot *slot)
>  {
> -	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
> -	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
> +	kvm_mmu_write_protect_all_pages(kvm, true);

What's the purpose of having @write_protect if
kvm_mmu_write_protect_all_pages() is only ever called to enable
protection?  If there's no known scenario where write protection is
explicitly disabled then there's no need for WP_ALL_ENABLE_MASK, i.e. a
non-zero generation would indicate write protection is enabled.  That'd
simplify the code and clean up the atomic usage.

>  }
>  
>  static void vmx_slot_disable_log_dirty(struct kvm *kvm,
> -- 
> 1.8.3.1
> 
>
Zhuang Yanying Jan. 21, 2019, 6:37 a.m. UTC | #2
> -----Original Message-----
> From: Sean Christopherson [mailto:sean.j.christopherson@intel.com]
> Sent: Friday, January 18, 2019 12:32 AM
> To: Zhuangyanying <ann.zhuangyanying@huawei.com>
> Cc: xiaoguangrong@tencent.com; pbonzini@redhat.com; Gonglei (Arei)
> <arei.gonglei@huawei.com>; qemu-devel@nongnu.org; kvm@vger.kernel.org;
> wangxin (U) <wangxinxin.wang@huawei.com>; Liujinsong (Paul)
> <liu.jinsong@huawei.com>
> Subject: Re: [PATCH 4/4] KVM: MMU: fast cleanup D bit based on fast write
> protect
> 
> On Thu, Jan 17, 2019 at 01:55:31PM +0000, Zhuangyanying wrote:
> > From: Zhuang Yanying <ann.zhuangyanying@huawei.com>
> >
> > When live-migrating a large-memory guest, vCPUs may hang for a long
> > time while migration is starting, e.g. 9s for a 2T guest
> > (linux-5.0.0-rc2 + qemu-3.1.0).
> > The reason is that memory_global_dirty_log_start() takes too long
> > while the vCPUs wait for the BQL. The page-by-page D-bit clearing is
> > the main time consumer. The idea of "KVM: MMU: fast write protect" by
> > Xiao Guangrong, especially kvm_mmu_write_protect_all_pages(),
> > is very helpful. With a small modification on top of his patches, the
> > problem is solved: 9s down to 0.5s.
> >
> > At the beginning of live migration, write protection is applied only
> > to the top-level SPTEs. A write from the guest then triggers an EPT
> > violation, and write protection is pushed down the direct map with
> > for_each_shadow_entry(). Finally the Dirty bit of the target page (in
> > the level-1 page table) is cleared and dirty page tracking starts. Of
> > course, the page the faulting GPA belongs to is marked dirty in
> > mmu_set_spte(). Xen has a similar implementation, just using EMT
> > instead of write protection.
> >
> > Signed-off-by: Zhuang Yanying <ann.zhuangyanying@huawei.com>
> > ---
> >  arch/x86/kvm/mmu.c     | 8 +++++---
> >  arch/x86/kvm/vmx/vmx.c | 3 +--
> >  2 files changed, 6 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index
> > 047b897..a18bcc0 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3257,7 +3257,10 @@ static bool mmu_load_shadow_page(struct kvm
> *kvm, struct kvm_mmu_page *sp)
> >  			break;
> >
> >  		if (is_last_spte(spte, sp->role.level)) {
> > -			flush |= spte_write_protect(sptep, false);
> > +			if (sp->role.level == PT_PAGE_TABLE_LEVEL)
> > +				flush |= spte_clear_dirty(sptep);
> > +			else
> > +				flush |= spte_write_protect(sptep, false);
> >  			continue;
> >  		}
> >
> > @@ -6114,7 +6117,6 @@ void kvm_mmu_write_protect_all_pages(struct
> kvm
> > *kvm, bool write_protect)  {
> >  	u64 wp_all_indicator, kvm_wp_all_gen;
> >
> > -	mutex_lock(&kvm->slots_lock);
> >  	wp_all_indicator = get_write_protect_all_indicator(kvm);
> >  	kvm_wp_all_gen = get_write_protect_all_gen(wp_all_indicator);
> >
> > @@ -6134,8 +6136,8 @@ void kvm_mmu_write_protect_all_pages(struct
> kvm *kvm, bool write_protect)
> >  	 */
> >  	if (write_protect)
> >  		kvm_reload_remote_mmus(kvm);
> > -	mutex_unlock(&kvm->slots_lock);
> 
> Why is the lock removed?  And why was it added in the first place?
> 
The original purpose of fast write protect was to implement lock-free
get_dirty_log; kvm_mmu_write_protect_all_pages() is a stand-alone KVM API. See
https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg04370.html
That series has 7 patches in total; I only need the first 3 to get the
step-by-step page table traversal. To keep Xiao Guangrong's patches intact,
I did not modify them directly.
> >  }
> > +EXPORT_SYMBOL_GPL(kvm_mmu_write_protect_all_pages);
> >
> >  static unsigned long
> >  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > f6915f1..5236a07 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -7180,8 +7180,7 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu,
> > int cpu)  static void vmx_slot_enable_log_dirty(struct kvm *kvm,
> >  				     struct kvm_memory_slot *slot)  {
> > -	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
> > -	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
> > +	kvm_mmu_write_protect_all_pages(kvm, true);
> 
> What's the purpose of having @write_protect if
> kvm_mmu_write_protect_all_pages() is only ever called to enable
> protection?  If there's no known scenario where write protection is
> explicitly disabled then there's no need for WP_ALL_ENABLE_MASK, i.e. a
> non-zero generation would indicate write protection is enabled.  That'd
> simplify the code and clean up the atomic usage.
> 
During live migration, splitting large pages depends on the creation of
memslot->dirty_bitmap in __kvm_set_memory_region().
The interface between QEMU and KVM enables dirty logging one memslot at a time,
so enabling dirty page tracking for the whole VM means calling
kvm_mmu_write_protect_all_pages() multiple times. The atomic usage lets those
repeated page-table update requests be merged. This is not elegant, but it works.
Creating all slots' dirty_bitmaps in a single API, so that
kvm_mmu_write_protect_all_pages() only has to be called once, would need more
invasive changes, even in QEMU.
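
To illustrate why the hook runs once per slot, the x86 commit path of that
era looks roughly like the sketch below (paraphrased from memory, not an
exact copy of kvm_mmu_slot_apply_flags()); every memslot that gains
KVM_MEM_LOG_DIRTY_PAGES invokes the per-slot enable hook, and with this
series each of those invocations reaches kvm_mmu_write_protect_all_pages():

static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
				     struct kvm_memory_slot *new)
{
	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
		/* With this series: kvm_mmu_write_protect_all_pages(kvm, true) */
		if (kvm_x86_ops->slot_enable_log_dirty)
			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
		else
			kvm_mmu_slot_remove_write_access(kvm, new);
	} else {
		if (kvm_x86_ops->slot_disable_log_dirty)
			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
	}
}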

Best regards,
-Zhuang Yanying
> >  }
> >
> >  static void vmx_slot_disable_log_dirty(struct kvm *kvm,
> > --
> > 1.8.3.1
> >
> >
Sean Christopherson Jan. 22, 2019, 3:17 p.m. UTC | #3
On Mon, Jan 21, 2019 at 06:37:36AM +0000, Zhuangyanying wrote:
> 
> > >  	u64 wp_all_indicator, kvm_wp_all_gen;
> > >
> > > -	mutex_lock(&kvm->slots_lock);
> > >  	wp_all_indicator = get_write_protect_all_indicator(kvm);
> > >  	kvm_wp_all_gen = get_write_protect_all_gen(wp_all_indicator);
> > >
> > > @@ -6134,8 +6136,8 @@ void kvm_mmu_write_protect_all_pages(struct
> > kvm *kvm, bool write_protect)
> > >  	 */
> > >  	if (write_protect)
> > >  		kvm_reload_remote_mmus(kvm);
> > > -	mutex_unlock(&kvm->slots_lock);
> > 
> > Why is the lock removed?  And why was it added in the first place?
> > 
> The original purpose of fast write protect was to implement lock-free
> get_dirty_log; kvm_mmu_write_protect_all_pages() is a stand-alone KVM API. See
> https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg04370.html
> That series has 7 patches in total; I only need the first 3 to get the
> step-by-step page table traversal. To keep Xiao Guangrong's patches intact,
> I did not modify them directly.

That's not a sufficient argument for adding locking and removing it one
patch later.

> > >  }
> > > +EXPORT_SYMBOL_GPL(kvm_mmu_write_protect_all_pages);
> > >
> > >  static unsigned long
> > >  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > > f6915f1..5236a07 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -7180,8 +7180,7 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu,
> > > int cpu)  static void vmx_slot_enable_log_dirty(struct kvm *kvm,
> > >  				     struct kvm_memory_slot *slot)  {
> > > -	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
> > > -	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
> > > +	kvm_mmu_write_protect_all_pages(kvm, true);
> > 
> > What's the purpose of having @write_protect if
> > kvm_mmu_write_protect_all_pages() is only ever called to enable
> > protection?  If there's no known scenario where write protection is
> > explicitly disabled then there's no need for WP_ALL_ENABLE_MASK, i.e. a
> > non-zero generation would indicate write protection is enabled.  That'd
> > simplify the code and clean up the atomic usage.
> > 
> During live migration, splitting large pages depends on the creation of
> memslot->dirty_bitmap in __kvm_set_memory_region().
> The interface between QEMU and KVM enables dirty logging one memslot at a time,
> so enabling dirty page tracking for the whole VM means calling
> kvm_mmu_write_protect_all_pages() multiple times. The atomic usage lets those
> repeated page-table update requests be merged. This is not elegant, but it works.
> Creating all slots' dirty_bitmaps in a single API, so that
> kvm_mmu_write_protect_all_pages() only has to be called once, would need more
> invasive changes, even in QEMU.

Calling kvm_mmu_write_protect_all_pages() multiple times is fine.  My
question was regarding the 'write_protect' parameter.  If all callers
always pass %true for 'write_protect' then why does the parameter exist?
And eliminating the parameter means you don't need an 'enable' flag
buried in the generation, which would simplify the implementation.
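
Concretely, the simplification might look something like the sketch below (a
rough illustration only; the indicator field and kvm_reload_remote_mmus() are
assumed from the earlier patches in this series, and a plain atomic64_inc()
stands in for the masked cmpxchg-based update):

void kvm_mmu_write_protect_all_pages(struct kvm *kvm)
{
	/*
	 * Any non-zero generation means "write protection requested", so no
	 * separate enable bit is needed; bumping the generation is enough to
	 * invalidate the cached top-level SPTEs.
	 */
	atomic64_inc(&kvm->arch.write_protect_all_indicator);

	/*
	 * Force vCPUs to reload their MMUs so the new generation is observed
	 * and top-level SPTEs are write protected on the next entry.
	 */
	kvm_reload_remote_mmus(kvm);
}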
Zhuang Yanying Jan. 23, 2019, 6:38 p.m. UTC | #4
> -----Original Message-----
> From: Sean Christopherson [mailto:sean.j.christopherson@intel.com]
> Sent: Tuesday, January 22, 2019 11:17 PM
> To: Zhuangyanying <ann.zhuangyanying@huawei.com>
> Cc: xiaoguangrong@tencent.com; pbonzini@redhat.com; Gonglei (Arei)
> <arei.gonglei@huawei.com>; qemu-devel@nongnu.org; kvm@vger.kernel.org;
> wangxin (U) <wangxinxin.wang@huawei.com>; Liujinsong (Paul)
> <liu.jinsong@huawei.com>; Zhoujian (jay) <jianjay.zhou@huawei.com>;
> xiaoguangrong.eric@gmail.com
> Subject: Re: [PATCH 4/4] KVM: MMU: fast cleanup D bit based on fast write
> protect
> 
> On Mon, Jan 21, 2019 at 06:37:36AM +0000, Zhuangyanying wrote:
> >
> > > >  	u64 wp_all_indicator, kvm_wp_all_gen;
> > > >
> > > > -	mutex_lock(&kvm->slots_lock);
> > > >  	wp_all_indicator = get_write_protect_all_indicator(kvm);
> > > >  	kvm_wp_all_gen = get_write_protect_all_gen(wp_all_indicator);
> > > >
> > > > @@ -6134,8 +6136,8 @@ void
> kvm_mmu_write_protect_all_pages(struct
> > > kvm *kvm, bool write_protect)
> > > >  	 */
> > > >  	if (write_protect)
> > > >  		kvm_reload_remote_mmus(kvm);
> > > > -	mutex_unlock(&kvm->slots_lock);
> > >
> > > Why is the lock removed?  And why was it added in the first place?
> > >
> > The original purpose of fast write protect was to implement lock-free
> > get_dirty_log; kvm_mmu_write_protect_all_pages() is a stand-alone KVM
> > API. See
> > https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg04370.html
> > That series has 7 patches in total; I only need the first 3 to get the
> > step-by-step page table traversal. To keep Xiao Guangrong's patches
> > intact, I did not modify them directly.
> 
> That's not a sufficient argument for adding locking and removing it one patch
> later.
> 
> > > >  }
> > > > +EXPORT_SYMBOL_GPL(kvm_mmu_write_protect_all_pages);
> > > >
> > > >  static unsigned long
> > > >  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control
> > > > *sc) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > > index
> > > > f6915f1..5236a07 100644
> > > > --- a/arch/x86/kvm/vmx/vmx.c
> > > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > > @@ -7180,8 +7180,7 @@ static void vmx_sched_in(struct kvm_vcpu
> > > > *vcpu, int cpu)  static void vmx_slot_enable_log_dirty(struct kvm *kvm,
> > > >  				     struct kvm_memory_slot *slot)  {
> > > > -	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
> > > > -	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
> > > > +	kvm_mmu_write_protect_all_pages(kvm, true);
> > >
> > > What's the purpose of having @write_protect if
> > > kvm_mmu_write_protect_all_pages() is only ever called to enable
> > > protection?  If there's no known scenario where write protection is
> > > explicitly disabled then there's no need for WP_ALL_ENABLE_MASK,
> > > i.e. a non-zero generation would indicate write protection is
> > > enabled.  That'd simplify the code and clean up the atomic usage.
> > >
> > During live migration, splitting large pages depends on the creation of
> > memslot->dirty_bitmap in __kvm_set_memory_region().
> > The interface between QEMU and KVM enables dirty logging one memslot at a
> > time, so enabling dirty page tracking for the whole VM means calling
> > kvm_mmu_write_protect_all_pages() multiple times. The atomic usage lets
> > those repeated page-table update requests be merged. This is not elegant,
> > but it works.
> > Creating all slots' dirty_bitmaps in a single API, so that
> > kvm_mmu_write_protect_all_pages() only has to be called once, would need
> > more invasive changes, even in QEMU.
> 
> Calling kvm_mmu_write_protect_all_pages() multiple times is fine.  My
> question was regarding the 'write_protect' parameter.  If all callers always
> pass %true for 'write_protect' then why does the parameter exist?
> And eliminating the parameter means you don't need an 'enable' flag buried in
> the generation, which would simplify the implementation.

In fact, when cancelling migration for a 2T VM, memory_global_dirty_log_stop()
holds the BQL for 12s: vmx_slot_disable_log_dirty() takes 4s and
kvm_mmu_zap_collapsible_sptes() takes 2s, doubled because SMM is enabled by default.
What about:
static void vmx_slot_disable_log_dirty(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
-	kvm_mmu_slot_set_dirty(kvm, slot);
+	kvm_mmu_write_protect_all_pages(kvm, false);
}

Sorry, this patch is not complete. I will send patch v2 soon.

Best regards,
-Zhuang Yanying

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 047b897..a18bcc0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3257,7 +3257,10 @@  static bool mmu_load_shadow_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 			break;
 
 		if (is_last_spte(spte, sp->role.level)) {
-			flush |= spte_write_protect(sptep, false);
+			if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+				flush |= spte_clear_dirty(sptep);
+			else
+				flush |= spte_write_protect(sptep, false);
 			continue;
 		}
 
@@ -6114,7 +6117,6 @@  void kvm_mmu_write_protect_all_pages(struct kvm *kvm, bool write_protect)
 {
 	u64 wp_all_indicator, kvm_wp_all_gen;
 
-	mutex_lock(&kvm->slots_lock);
 	wp_all_indicator = get_write_protect_all_indicator(kvm);
 	kvm_wp_all_gen = get_write_protect_all_gen(wp_all_indicator);
 
@@ -6134,8 +6136,8 @@  void kvm_mmu_write_protect_all_pages(struct kvm *kvm, bool write_protect)
 	 */
 	if (write_protect)
 		kvm_reload_remote_mmus(kvm);
-	mutex_unlock(&kvm->slots_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_write_protect_all_pages);
 
 static unsigned long
 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6915f1..5236a07 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7180,8 +7180,7 @@  static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
 				     struct kvm_memory_slot *slot)
 {
-	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
-	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+	kvm_mmu_write_protect_all_pages(kvm, true);
 }
 
 static void vmx_slot_disable_log_dirty(struct kvm *kvm,