diff mbox series

[RFC] KVM: arm64: support enabling dirty log graually in small chunks

Message ID 20200309085727.1106-1-zhukeqian1@huawei.com (mailing list archive)
State New, archived
Headers show
Series [RFC] KVM: arm64: support enabling dirty log graually in small chunks | expand

Commit Message

zhukeqian March 9, 2020, 8:57 a.m. UTC
There is already support of enabling dirty log graually
in small chunks for x86. This adds support for arm64.

Under the Huawei Kunpeng 920 2.6GHz platform, I did some
tests with a 128G linux VM and counted the time taken of
memory_global_dirty_log_start, here is the numbers:

VM Size        Before    After optimization
128G           527ms     4ms

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
---
Cc: Jay Zhou <jianjay.zhou@huawei.com>
Cc: Paolo Bonzini <pbonzini@redhat.com> 
Cc: Peter Xu <peterx@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
---
 Documentation/virt/kvm/api.rst    |  2 +-
 arch/arm64/include/asm/kvm_host.h |  4 ++++
 virt/kvm/arm/mmu.c                | 30 ++++++++++++++++++++++--------
 3 files changed, 27 insertions(+), 9 deletions(-)

Comments

Marc Zyngier March 9, 2020, 11:45 a.m. UTC | #1
Kegian,

In the future, please Cc me on  your KVM/arm64 patches, as well as
all the reviewers mentioned in the MAINTAINERS file.

On 2020-03-09 08:57, Keqian Zhu wrote:
> There is already support of enabling dirty log graually

gradually?

> in small chunks for x86. This adds support for arm64.
> 
> Under the Huawei Kunpeng 920 2.6GHz platform, I did some
> tests with a 128G linux VM and counted the time taken of

Linux

> memory_global_dirty_log_start, here is the numbers:
> 
> VM Size        Before    After optimization
> 128G           527ms     4ms

What does this benchmark do? Can you please provide a pointer to it?

> 
> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
> ---
> Cc: Jay Zhou <jianjay.zhou@huawei.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> ---
>  Documentation/virt/kvm/api.rst    |  2 +-
>  arch/arm64/include/asm/kvm_host.h |  4 ++++
>  virt/kvm/arm/mmu.c                | 30 ++++++++++++++++++++++--------
>  3 files changed, 27 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst 
> b/Documentation/virt/kvm/api.rst
> index 0adef66585b1..89d4f2680af1 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -5735,7 +5735,7 @@ will be initialized to 1 when created.  This
> also improves performance because
>  dirty logging can be enabled gradually in small chunks on the first 
> call
>  to KVM_CLEAR_DIRTY_LOG.  KVM_DIRTY_LOG_INITIALLY_SET depends on
>  KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on
> -x86 for now).
> +x86 and arm64 for now).

What is this based on? I can't find this in -next, and you provide no
context whatsoever.

I assume this is related to this:
https://lore.kernel.org/kvm/20200227013227.1401-1-jianjay.zhou@huawei.com/

Is there a userspace counterpart to it?

>  KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the 
> name
>  KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that 
> make
> diff --git a/arch/arm64/include/asm/kvm_host.h
> b/arch/arm64/include/asm/kvm_host.h
> index d87aa609d2b6..0deb2ac7d091 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -16,6 +16,7 @@
>  #include <linux/jump_label.h>
>  #include <linux/kvm_types.h>
>  #include <linux/percpu.h>
> +#include <linux/kvm.h>
>  #include <asm/arch_gicv3.h>
>  #include <asm/barrier.h>
>  #include <asm/cpufeature.h>
> @@ -45,6 +46,9 @@
>  #define KVM_REQ_VCPU_RESET	KVM_ARCH_REQ(2)
>  #define KVM_REQ_RECORD_STEAL	KVM_ARCH_REQ(3)
> 
> +#define KVM_DIRTY_LOG_MANUAL_CAPS   
> (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
> +					KVM_DIRTY_LOG_INITIALLY_SET)
> +
>  DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
> 
>  extern unsigned int kvm_sve_max_vl;
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index e3b9ee268823..5c7ca84dec85 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1438,9 +1438,11 @@ static void stage2_wp_ptes(pmd_t *pmd,
> phys_addr_t addr, phys_addr_t end)
>   * @pud:	pointer to pud entry
>   * @addr:	range start address
>   * @end:	range end address
> + * @wp_ptes:	write protect ptes or not
>   */
>  static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
> -			   phys_addr_t addr, phys_addr_t end)
> +			   phys_addr_t addr, phys_addr_t end,
> +			   bool wp_ptes)

If you are going to pass extra parameters like this, make it at least
extensible (unsigned long flags, for example).

>  {
>  	pmd_t *pmd;
>  	phys_addr_t next;
> @@ -1453,7 +1455,7 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t 
> *pud,
>  			if (pmd_thp_or_huge(*pmd)) {
>  				if (!kvm_s2pmd_readonly(pmd))
>  					kvm_set_s2pmd_readonly(pmd);
> -			} else {
> +			} else if (wp_ptes) {
>  				stage2_wp_ptes(pmd, addr, next);
>  			}
>  		}
> @@ -1465,9 +1467,11 @@ static void stage2_wp_pmds(struct kvm *kvm, 
> pud_t *pud,
>   * @pgd:	pointer to pgd entry
>   * @addr:	range start address
>   * @end:	range end address
> + * @wp_ptes:	write protect ptes or not
>   */
>  static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
> -			    phys_addr_t addr, phys_addr_t end)
> +			    phys_addr_t addr, phys_addr_t end,
> +			    bool wp_ptes)
>  {
>  	pud_t *pud;
>  	phys_addr_t next;
> @@ -1480,7 +1484,7 @@ static void  stage2_wp_puds(struct kvm *kvm, 
> pgd_t *pgd,
>  				if (!kvm_s2pud_readonly(pud))
>  					kvm_set_s2pud_readonly(pud);
>  			} else {
> -				stage2_wp_pmds(kvm, pud, addr, next);
> +				stage2_wp_pmds(kvm, pud, addr, next, wp_ptes);
>  			}
>  		}
>  	} while (pud++, addr = next, addr != end);
> @@ -1491,8 +1495,10 @@ static void  stage2_wp_puds(struct kvm *kvm, 
> pgd_t *pgd,
>   * @kvm:	The KVM pointer
>   * @addr:	Start address of range
>   * @end:	End address of range
> + * @wp_ptes:	Write protect ptes or not
>   */
> -static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, 
> phys_addr_t end)
> +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr,
> +			    phys_addr_t end, bool wp_ptes)
>  {
>  	pgd_t *pgd;
>  	phys_addr_t next;
> @@ -1513,7 +1519,7 @@ static void stage2_wp_range(struct kvm *kvm,
> phys_addr_t addr, phys_addr_t end)
>  			break;
>  		next = stage2_pgd_addr_end(kvm, addr, end);
>  		if (stage2_pgd_present(kvm, *pgd))
> -			stage2_wp_puds(kvm, pgd, addr, next);
> +			stage2_wp_puds(kvm, pgd, addr, next, wp_ptes);
>  	} while (pgd++, addr = next, addr != end);
>  }
> 
> @@ -1535,6 +1541,7 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, 
> int slot)
>  	struct kvm_memslots *slots = kvm_memslots(kvm);
>  	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
>  	phys_addr_t start, end;
> +	bool wp_ptes;
> 
>  	if (WARN_ON_ONCE(!memslot))
>  		return;
> @@ -1543,7 +1550,14 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, 
> int slot)
>  	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
> 
>  	spin_lock(&kvm->mmu_lock);
> -	stage2_wp_range(kvm, start, end);
> +	/*
> +	 * If we're with initial-all-set, we don't need to write protect
> +	 * any small page because they're reported as dirty already.
> +	 * However we still need to write-protect huge pages so that the
> +	 * page split can happen lazily on the first write to the huge page.
> +	 */
> +	wp_ptes = !kvm_dirty_log_manual_protect_and_init_set(kvm);
> +	stage2_wp_range(kvm, start, end, wp_ptes);
>  	spin_unlock(&kvm->mmu_lock);
>  	kvm_flush_remote_tlbs(kvm);
>  }
> @@ -1567,7 +1581,7 @@ static void
> kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>  	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
>  	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
> 
> -	stage2_wp_range(kvm, start, end);
> +	stage2_wp_range(kvm, start, end, true);
>  }
> 
>  /*

Thanks,

         M.
zhukeqian March 10, 2020, 8:26 a.m. UTC | #2
Hi Marc,

On 2020/3/9 19:45, Marc Zyngier wrote:
> Kegian,
> 
> In the future, please Cc me on  your KVM/arm64 patches, as well as
> all the reviewers mentioned in the MAINTAINERS file.
> 
> On 2020-03-09 08:57, Keqian Zhu wrote:
>> There is already support of enabling dirty log graually
> 
> gradually?
> 
Yeah, gradually. :)
>> in small chunks for x86. This adds support for arm64.
>>
>> Under the Huawei Kunpeng 920 2.6GHz platform, I did some
>> tests with a 128G linux VM and counted the time taken of
> 
> Linux
Thanks.
> 
>> memory_global_dirty_log_start, here is the numbers:
>>
>> VM Size        Before    After optimization
>> 128G           527ms     4ms
> 
> What does this benchmark do? Can you please provide a pointer to it?
> 
I will explain this in following text.
>>
>> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
>> ---
>> Cc: Jay Zhou <jianjay.zhou@huawei.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Peter Xu <peterx@redhat.com>
>> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
>> ---
>>  Documentation/virt/kvm/api.rst    |  2 +-
>>  arch/arm64/include/asm/kvm_host.h |  4 ++++
>>  virt/kvm/arm/mmu.c                | 30 ++++++++++++++++++++++--------
>>  3 files changed, 27 insertions(+), 9 deletions(-)
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 0adef66585b1..89d4f2680af1 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -5735,7 +5735,7 @@ will be initialized to 1 when created.  This
>> also improves performance because
>>  dirty logging can be enabled gradually in small chunks on the first call
>>  to KVM_CLEAR_DIRTY_LOG.  KVM_DIRTY_LOG_INITIALLY_SET depends on
>>  KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on
>> -x86 for now).
>> +x86 and arm64 for now).
> 
> What is this based on? I can't find this in -next, and you provide no
> context whatsoever.
This is based on branch "queue" of git://git.kernel.org/pub/scm/virt/kvm/kvm.git
> 
> I assume this is related to this:
> https://lore.kernel.org/kvm/20200227013227.1401-1-jianjay.zhou@huawei.com/
> 
Yes, you are right.

The background is that in [https://patchwork.kernel.org/cover/10702447/], Paolo
made an optimization for dirty log sync used by VM migration. Currently the dirty
log sync logic is getting and clearing dirty log at the same time for each KVM
memslot. This will lead to obvious problem for large guests.

As described by Paolo, "First, and less important, it can take kvm->mmu_lock for
an extended period of time.  Second, its user can actually see many false positives
in some cases." There will be enough time for guests mark page dirty again between
Qemu synchronizes dirty log and actually sends these page, so both guests and Qemu
will suffer unnecessary overhead.

Paolo introduced a new KVM ioctl. "The new KVM_CLEAR_DIRTY_LOG ioctl can operate
on a 64-page granularity rather than requiring to sync a full memslot. This way
the mmu_lock is taken for small amounts of time, and only a small amount of time
will pass between write protection of pages and the sending of their content."

The changes made by Paolo have been merge to mainline kernel. And the userspace
counterpart (Qemu) also has been updated.

After that, Jay Zhou declared an optimization about enable dirty log (The link you
paste above) based on Paolo's work. When enabling dirty log, we dont need to write
protect PTEs now. All PTEs will be write protected after first round RAM sending.

> Is there a userspace counterpart to it?
> 
As this KVM/x86 related changes have not been merged to mainline kernel, some little
modification is needed on mainline Qemu.

As I tested this patch on a 128GB RAM Linux VM with no huge pages, the time of enabling
dirty log will decrease obviously.

>>  KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name
>>  KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make
>> diff --git a/arch/arm64/include/asm/kvm_host.h
>> b/arch/arm64/include/asm/kvm_host.h
>> index d87aa609d2b6..0deb2ac7d091 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -16,6 +16,7 @@
>>  #include <linux/jump_label.h>
>>  #include <linux/kvm_types.h>
>>  #include <linux/percpu.h>
>> +#include <linux/kvm.h>
>>  #include <asm/arch_gicv3.h>
>>  #include <asm/barrier.h>
>>  #include <asm/cpufeature.h>
>> @@ -45,6 +46,9 @@
>>  #define KVM_REQ_VCPU_RESET    KVM_ARCH_REQ(2)
>>  #define KVM_REQ_RECORD_STEAL    KVM_ARCH_REQ(3)
>>
>> +#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
>> +                    KVM_DIRTY_LOG_INITIALLY_SET)
>> +
>>  DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
>>
>>  extern unsigned int kvm_sve_max_vl;
>> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
>> index e3b9ee268823..5c7ca84dec85 100644
>> --- a/virt/kvm/arm/mmu.c
>> +++ b/virt/kvm/arm/mmu.c
>> @@ -1438,9 +1438,11 @@ static void stage2_wp_ptes(pmd_t *pmd,
>> phys_addr_t addr, phys_addr_t end)
>>   * @pud:    pointer to pud entry
>>   * @addr:    range start address
>>   * @end:    range end address
>> + * @wp_ptes:    write protect ptes or not
>>   */
>>  static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
>> -               phys_addr_t addr, phys_addr_t end)
>> +               phys_addr_t addr, phys_addr_t end,
>> +               bool wp_ptes)
> 
> If you are going to pass extra parameters like this, make it at least
> extensible (unsigned long flags, for example).
> 
OK, I will use flags in formal patch.
>>  {
>>      pmd_t *pmd;
>>      phys_addr_t next;
>> @@ -1453,7 +1455,7 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
>>              if (pmd_thp_or_huge(*pmd)) {
>>                  if (!kvm_s2pmd_readonly(pmd))
>>                      kvm_set_s2pmd_readonly(pmd);
>> -            } else {
>> +            } else if (wp_ptes) {
>>                  stage2_wp_ptes(pmd, addr, next);
>>              }
>>          }
[...]

Thanks,
keqian
Marc Zyngier March 10, 2020, 1:16 p.m. UTC | #3
On 2020-03-10 08:26, zhukeqian wrote:
> Hi Marc,
> 
> On 2020/3/9 19:45, Marc Zyngier wrote:
>> Kegian,

[...]

>> Is there a userspace counterpart to it?
>> 
> As this KVM/x86 related changes have not been merged to mainline
> kernel, some little modification is needed on mainline Qemu.

Could you please point me to these changes?

> As I tested this patch on a 128GB RAM Linux VM with no huge pages, the
> time of enabling dirty log will decrease obviously.

I'm not sure how realistic that is. Not having huge pages tends to lead
to pretty bad performance in general...

Thanks,

         M.
zhukeqian March 11, 2020, 7:19 a.m. UTC | #4
Hi Marc,

On 2020/3/10 21:16, Marc Zyngier wrote:
> On 2020-03-10 08:26, zhukeqian wrote:
>> Hi Marc,
>>
>> On 2020/3/9 19:45, Marc Zyngier wrote:
>>> Kegian,
> 
> [...]
> 
>>> Is there a userspace counterpart to it?
>>>
>> As this KVM/x86 related changes have not been merged to mainline
>> kernel, some little modification is needed on mainline Qemu.
> 
> Could you please point me to these changes?
I made some changes locally listed below.

However, Qemu can choose to enable KVM_DIRTY_LOG_INITIALLY_SET or not.
Here I made no judgement on dirty_log_manual_caps because I just want
to verify the optimization of this patch.

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 439a4efe52..1611f644a4 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2007,14 +2007,16 @@ static int kvm_init(MachineState *ms)
     s->coalesced_pio = s->coalesced_mmio &&
                        kvm_check_extension(s, KVM_CAP_COALESCED_PIO);

-    s->manual_dirty_log_protect =
+    uint64_t dirty_log_manual_caps =
         kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
-    if (s->manual_dirty_log_protect) {
-        ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1);
+    if (dirty_log_manual_caps) {
+        ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
+                                dirty_log_manual_caps);
         if (ret) {
             warn_report("Trying to enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 "
                         "but failed.  Falling back to the legacy mode. ");
-            s->manual_dirty_log_protect = false;
+        } else {
+            s->manual_dirty_log_protect = true;
         }
     }

> 
>> As I tested this patch on a 128GB RAM Linux VM with no huge pages, the
>> time of enabling dirty log will decrease obviously.
> 
> I'm not sure how realistic that is. Not having huge pages tends to lead
> to pretty bad performance in general...
Sure, this has no effect on guests which are all of huge pages.

For my understanding, once a guest has normal pages (maybe are initialized
at beginning or dissloved from huge pages), it can benefit from this patch.
> 
> Thanks,
> 
>         M.
Pretty thanks for your review.

Thanks,
Keqian
Zhoujian (jay) March 11, 2020, 7:34 a.m. UTC | #5
> -----Original Message-----
> From: zhukeqian
> Sent: Wednesday, March 11, 2020 3:20 PM
> To: Marc Zyngier <maz@kernel.org>
> Cc: kvmarm@lists.cs.columbia.edu; kvm@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org; Zhoujian (jay)
> <jianjay.zhou@huawei.com>; Sean Christopherson
> <sean.j.christopherson@intel.com>; Paolo Bonzini <pbonzini@redhat.com>;
> James Morse <james.morse@arm.com>; Julien Thierry
> <julien.thierry.kdev@gmail.com>; Suzuki K Poulose <suzuki.poulose@arm.com>
> Subject: Re: [RFC] KVM: arm64: support enabling dirty log graually in small chunks
> 
> Hi Marc,
> 
> On 2020/3/10 21:16, Marc Zyngier wrote:
> > On 2020-03-10 08:26, zhukeqian wrote:
> >> Hi Marc,
> >>
> >> On 2020/3/9 19:45, Marc Zyngier wrote:
> >>> Kegian,
> >
> > [...]
> >
> >>> Is there a userspace counterpart to it?
> >>>
> >> As this KVM/x86 related changes have not been merged to mainline
> >> kernel, some little modification is needed on mainline Qemu.
> >
> > Could you please point me to these changes?
> I made some changes locally listed below.
> 
> However, Qemu can choose to enable KVM_DIRTY_LOG_INITIALLY_SET or not.
> Here I made no judgement on dirty_log_manual_caps because I just want to
> verify the optimization of this patch.
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index
> 439a4efe52..1611f644a4 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2007,14 +2007,16 @@ static int kvm_init(MachineState *ms)
>      s->coalesced_pio = s->coalesced_mmio &&
>                         kvm_check_extension(s,
> KVM_CAP_COALESCED_PIO);
> 
> -    s->manual_dirty_log_protect =
> +    uint64_t dirty_log_manual_caps =
>          kvm_check_extension(s,
> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
> -    if (s->manual_dirty_log_protect) {
> -        ret = kvm_vm_enable_cap(s,
> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1);
> +    if (dirty_log_manual_caps) {
> +        ret = kvm_vm_enable_cap(s,
> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
> +                                dirty_log_manual_caps);
>          if (ret) {
>              warn_report("Trying to enable
> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 "
>                          "but failed.  Falling back to the legacy mode. ");
> -            s->manual_dirty_log_protect = false;
> +        } else {
> +            s->manual_dirty_log_protect = true;
>          }
>      }

FYI: I had submitted a patch to the Qemu community some days ago:
https://patchwork.kernel.org/patch/11419191/

> >
> >> As I tested this patch on a 128GB RAM Linux VM with no huge pages,
> >> the time of enabling dirty log will decrease obviously.
> >
> > I'm not sure how realistic that is. Not having huge pages tends to
> > lead to pretty bad performance in general...
> Sure, this has no effect on guests which are all of huge pages.
> 
> For my understanding, once a guest has normal pages (maybe are initialized at
> beginning or dissloved from huge pages), it can benefit from this patch.

Yes, I agree.



Regards,
Jay Zhou
zhukeqian March 12, 2020, 1:45 a.m. UTC | #6
Hi Jay,

On 2020/3/11 15:34, Zhoujian (jay) wrote:
> 
> 
>> -----Original Message-----
>> From: zhukeqian
>> Sent: Wednesday, March 11, 2020 3:20 PM
>> To: Marc Zyngier <maz@kernel.org>
>> Cc: kvmarm@lists.cs.columbia.edu; kvm@vger.kernel.org;
>> linux-kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org; Zhoujian (jay)
>> <jianjay.zhou@huawei.com>; Sean Christopherson
>> <sean.j.christopherson@intel.com>; Paolo Bonzini <pbonzini@redhat.com>;
>> James Morse <james.morse@arm.com>; Julien Thierry
>> <julien.thierry.kdev@gmail.com>; Suzuki K Poulose <suzuki.poulose@arm.com>
>> Subject: Re: [RFC] KVM: arm64: support enabling dirty log graually in small chunks
>>
>> Hi Marc,
>>
>> On 2020/3/10 21:16, Marc Zyngier wrote:
>>> On 2020-03-10 08:26, zhukeqian wrote:
>>>> Hi Marc,
>>>>
>>>> On 2020/3/9 19:45, Marc Zyngier wrote:
>>>>> Kegian,
>>>
>>> [...]
>>>
>>>>> Is there a userspace counterpart to it?
>>>>>
>>>> As this KVM/x86 related changes have not been merged to mainline
>>>> kernel, some little modification is needed on mainline Qemu.
>>>
>>> Could you please point me to these changes?
>> I made some changes locally listed below.
>>
>> However, Qemu can choose to enable KVM_DIRTY_LOG_INITIALLY_SET or not.
>> Here I made no judgement on dirty_log_manual_caps because I just want to
>> verify the optimization of this patch.
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index
>> 439a4efe52..1611f644a4 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -2007,14 +2007,16 @@ static int kvm_init(MachineState *ms)
>>      s->coalesced_pio = s->coalesced_mmio &&
>>                         kvm_check_extension(s,
>> KVM_CAP_COALESCED_PIO);
>>
>> -    s->manual_dirty_log_protect =
>> +    uint64_t dirty_log_manual_caps =
>>          kvm_check_extension(s,
>> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
>> -    if (s->manual_dirty_log_protect) {
>> -        ret = kvm_vm_enable_cap(s,
>> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1);
>> +    if (dirty_log_manual_caps) {
>> +        ret = kvm_vm_enable_cap(s,
>> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
>> +                                dirty_log_manual_caps);
>>          if (ret) {
>>              warn_report("Trying to enable
>> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 "
>>                          "but failed.  Falling back to the legacy mode. ");
>> -            s->manual_dirty_log_protect = false;
>> +        } else {
>> +            s->manual_dirty_log_protect = true;
>>          }
>>      }
> 
> FYI: I had submitted a patch to the Qemu community some days ago:
> https://patchwork.kernel.org/patch/11419191/
This is very helpful, thanks.
> 
>>>
>>>> As I tested this patch on a 128GB RAM Linux VM with no huge pages,
>>>> the time of enabling dirty log will decrease obviously.
>>>
>>> I'm not sure how realistic that is. Not having huge pages tends to
>>> lead to pretty bad performance in general...
>> Sure, this has no effect on guests which are all of huge pages.
>>
>> For my understanding, once a guest has normal pages (maybe are initialized at
>> beginning or dissloved from huge pages), it can benefit from this patch.
> 
> Yes, I agree.
> 
I will send PATCH v1 soon.
> 
> 
> Regards,
> Jay Zhou
> 
> .
>
Thanks,
Keqian
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 0adef66585b1..89d4f2680af1 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5735,7 +5735,7 @@  will be initialized to 1 when created.  This also improves performance because
 dirty logging can be enabled gradually in small chunks on the first call
 to KVM_CLEAR_DIRTY_LOG.  KVM_DIRTY_LOG_INITIALLY_SET depends on
 KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on
-x86 for now).
+x86 and arm64 for now).
 
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d87aa609d2b6..0deb2ac7d091 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -16,6 +16,7 @@ 
 #include <linux/jump_label.h>
 #include <linux/kvm_types.h>
 #include <linux/percpu.h>
+#include <linux/kvm.h>
 #include <asm/arch_gicv3.h>
 #include <asm/barrier.h>
 #include <asm/cpufeature.h>
@@ -45,6 +46,9 @@ 
 #define KVM_REQ_VCPU_RESET	KVM_ARCH_REQ(2)
 #define KVM_REQ_RECORD_STEAL	KVM_ARCH_REQ(3)
 
+#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+					KVM_DIRTY_LOG_INITIALLY_SET)
+
 DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
 extern unsigned int kvm_sve_max_vl;
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index e3b9ee268823..5c7ca84dec85 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1438,9 +1438,11 @@  static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
  * @pud:	pointer to pud entry
  * @addr:	range start address
  * @end:	range end address
+ * @wp_ptes:	write protect ptes or not
  */
 static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
-			   phys_addr_t addr, phys_addr_t end)
+			   phys_addr_t addr, phys_addr_t end,
+			   bool wp_ptes)
 {
 	pmd_t *pmd;
 	phys_addr_t next;
@@ -1453,7 +1455,7 @@  static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
 			if (pmd_thp_or_huge(*pmd)) {
 				if (!kvm_s2pmd_readonly(pmd))
 					kvm_set_s2pmd_readonly(pmd);
-			} else {
+			} else if (wp_ptes) {
 				stage2_wp_ptes(pmd, addr, next);
 			}
 		}
@@ -1465,9 +1467,11 @@  static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
  * @pgd:	pointer to pgd entry
  * @addr:	range start address
  * @end:	range end address
+ * @wp_ptes:	write protect ptes or not
  */
 static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
-			    phys_addr_t addr, phys_addr_t end)
+			    phys_addr_t addr, phys_addr_t end,
+			    bool wp_ptes)
 {
 	pud_t *pud;
 	phys_addr_t next;
@@ -1480,7 +1484,7 @@  static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
 				if (!kvm_s2pud_readonly(pud))
 					kvm_set_s2pud_readonly(pud);
 			} else {
-				stage2_wp_pmds(kvm, pud, addr, next);
+				stage2_wp_pmds(kvm, pud, addr, next, wp_ptes);
 			}
 		}
 	} while (pud++, addr = next, addr != end);
@@ -1491,8 +1495,10 @@  static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
  * @kvm:	The KVM pointer
  * @addr:	Start address of range
  * @end:	End address of range
+ * @wp_ptes:	Write protect ptes or not
  */
-static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr,
+			    phys_addr_t end, bool wp_ptes)
 {
 	pgd_t *pgd;
 	phys_addr_t next;
@@ -1513,7 +1519,7 @@  static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 			break;
 		next = stage2_pgd_addr_end(kvm, addr, end);
 		if (stage2_pgd_present(kvm, *pgd))
-			stage2_wp_puds(kvm, pgd, addr, next);
+			stage2_wp_puds(kvm, pgd, addr, next, wp_ptes);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -1535,6 +1541,7 @@  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
 	phys_addr_t start, end;
+	bool wp_ptes;
 
 	if (WARN_ON_ONCE(!memslot))
 		return;
@@ -1543,7 +1550,14 @@  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
 
 	spin_lock(&kvm->mmu_lock);
-	stage2_wp_range(kvm, start, end);
+	/*
+	 * If we're with initial-all-set, we don't need to write protect
+	 * any small page because they're reported as dirty already.
+	 * However we still need to write-protect huge pages so that the
+	 * page split can happen lazily on the first write to the huge page.
+	 */
+	wp_ptes = !kvm_dirty_log_manual_protect_and_init_set(kvm);
+	stage2_wp_range(kvm, start, end, wp_ptes);
 	spin_unlock(&kvm->mmu_lock);
 	kvm_flush_remote_tlbs(kvm);
 }
@@ -1567,7 +1581,7 @@  static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
 
-	stage2_wp_range(kvm, start, end);
+	stage2_wp_range(kvm, start, end, true);
 }
 
 /*