diff mbox series

[RFC,v2,4/8] KVM: arm64: Set DBM for previously writeable pages

Message ID 20230825093528.1637-5-shameerali.kolothum.thodi@huawei.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Implement SW/HW combined dirty log | expand

Commit Message

Shameerali Kolothum Thodi Aug. 25, 2023, 9:35 a.m. UTC
We only set DBM if the page is writeable (S2AP[1] == 1). But once migration
starts, the CLEAR_LOG path will write-protect the pages (S2AP[1] = 0) and there
isn't an easy way to differentiate the writeable pages that get write-protected
from read-only pages, as we only have the S2AP[1] bit to check.

Introduce a ctx->flags bit, KVM_PGTABLE_WALK_WC_HINT, to identify the
write-protect page table walk related to dirty page tracking, and use one of
the "Reserved for software use" bits in the page descriptor to mark a page as
"writeable-clean".

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 arch/arm64/include/asm/kvm_pgtable.h |  5 +++++
 arch/arm64/kvm/hyp/pgtable.c         | 25 ++++++++++++++++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)

Comments

Oliver Upton Sept. 15, 2023, 10:54 p.m. UTC | #1
On Fri, Aug 25, 2023 at 10:35:24AM +0100, Shameer Kolothum wrote:
> We only set DBM if the page is writeable (S2AP[1] == 1). But once migration
> starts, CLEAR_LOG path will write protect the pages (S2AP[1] = 0) and there
> isn't an easy way to differentiate the writeable pages that gets write
> protected from read-only pages as we only have S2AP[1] bit to check.
> 
> Introduced a ctx->flag KVM_PGTABLE_WALK_WC_HINT to identify the dirty page
> tracking related write-protect page table walk and used one of the "Reserved
> for software use" bit in page descriptor to mark a page as "writeable-clean". 
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h |  5 +++++
>  arch/arm64/kvm/hyp/pgtable.c         | 25 ++++++++++++++++++++++---
>  2 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index a12add002b89..67bcbc5984f9 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -190,6 +190,8 @@ enum kvm_pgtable_prot {
>  #define KVM_PGTABLE_PROT_RW	(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
>  #define KVM_PGTABLE_PROT_RWX	(KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
>  
> +#define KVM_PGTABLE_PROT_WC	KVM_PGTABLE_PROT_SW0  /*write-clean*/
> +
>  #define PKVM_HOST_MEM_PROT	KVM_PGTABLE_PROT_RWX
>  #define PKVM_HOST_MMIO_PROT	KVM_PGTABLE_PROT_RW
>  
> @@ -221,6 +223,8 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
>   *					operations required.
>   * @KVM_PGTABLE_WALK_HW_DBM:		Indicates that the attribute update is
>   *					HW DBM related.
> + * @KVM_PGTABLE_WALK_WC_HINT:		Update the page as writeable-clean(software attribute)
> + *					if we are write protecting a writeable page.

This really looks like a permission bit, not a walker flag. This should
be defined in kvm_pgtable_prot and converted to the hardware definition
in stage2_set_prot_attr(). Also, the first time I saw 'WC' I read it as
'write-combine', not writable-clean.

As I understand it, the only need for an additional software bit here is
to identify neighboring PTEs that can have DBM set while we're in the
middle of the walk, right?
Shameerali Kolothum Thodi Sept. 18, 2023, 9:54 a.m. UTC | #2
> -----Original Message-----
> From: Oliver Upton [mailto:oliver.upton@linux.dev]
> Sent: 15 September 2023 23:54
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: kvmarm@lists.linux.dev; kvm@vger.kernel.org;
> linux-arm-kernel@lists.infradead.org; maz@kernel.org; will@kernel.org;
> catalin.marinas@arm.com; james.morse@arm.com;
> suzuki.poulose@arm.com; yuzenghui <yuzenghui@huawei.com>; zhukeqian
> <zhukeqian1@huawei.com>; Jonathan Cameron
> <jonathan.cameron@huawei.com>; Linuxarm <linuxarm@huawei.com>
> Subject: Re: [RFC PATCH v2 4/8] KVM: arm64: Set DBM for previously
> writeable pages
> 
> On Fri, Aug 25, 2023 at 10:35:24AM +0100, Shameer Kolothum wrote:
> > We only set DBM if the page is writeable (S2AP[1] == 1). But once
> migration
> > starts, CLEAR_LOG path will write protect the pages (S2AP[1] = 0) and
> there
> > isn't an easy way to differentiate the writeable pages that gets write
> > protected from read-only pages as we only have S2AP[1] bit to check.
> >
> > Introduced a ctx->flag KVM_PGTABLE_WALK_WC_HINT to identify the
> dirty page
> > tracking related write-protect page table walk and used one of the
> "Reserved
> > for software use" bit in page descriptor to mark a page as
> "writeable-clean".
> >
> > Signed-off-by: Shameer Kolothum
> <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h |  5 +++++
> >  arch/arm64/kvm/hyp/pgtable.c         | 25
> ++++++++++++++++++++++---
> >  2 files changed, 27 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h
> b/arch/arm64/include/asm/kvm_pgtable.h
> > index a12add002b89..67bcbc5984f9 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -190,6 +190,8 @@ enum kvm_pgtable_prot {
> >  #define KVM_PGTABLE_PROT_RW	(KVM_PGTABLE_PROT_R |
> KVM_PGTABLE_PROT_W)
> >  #define KVM_PGTABLE_PROT_RWX	(KVM_PGTABLE_PROT_RW |
> KVM_PGTABLE_PROT_X)
> >
> > +#define KVM_PGTABLE_PROT_WC	KVM_PGTABLE_PROT_SW0
> /*write-clean*/
> > +
> >  #define PKVM_HOST_MEM_PROT	KVM_PGTABLE_PROT_RWX
> >  #define PKVM_HOST_MMIO_PROT	KVM_PGTABLE_PROT_RW
> >
> > @@ -221,6 +223,8 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64
> addr, u64 end,
> >   *					operations required.
> >   * @KVM_PGTABLE_WALK_HW_DBM:		Indicates that the attribute
> update is
> >   *					HW DBM related.
> > + * @KVM_PGTABLE_WALK_WC_HINT:		Update the page as
> writeable-clean(software attribute)
> > + *					if we are write protecting a writeable page.
> 
> This really looks like a permission bit, not a walker flag. This should
> be defined in kvm_pgtable_prot and converted to the hardware definition
> in stage2_set_prot_attr(). Also, the first time I saw 'WC' I read it as
> 'write-combine', not writable-clean.

Ok. I will have a go as suggested above. In fact, "writeable-clean" is an
ARM ARM term; I will make it more specific.

> 
> As I understand it, the only need for an additional software bit here is
> to identify neighboring PTEs that can have DBM set while we're in the
> middle of the walk right?

Yes, that is the purpose.

Thanks,
Shameer
Catalin Marinas Sept. 22, 2023, 3:40 p.m. UTC | #3
On Fri, Aug 25, 2023 at 10:35:24AM +0100, Shameer Kolothum wrote:
> We only set DBM if the page is writeable (S2AP[1] == 1). But once migration
> starts, CLEAR_LOG path will write protect the pages (S2AP[1] = 0) and there
> isn't an easy way to differentiate the writeable pages that gets write
> protected from read-only pages as we only have S2AP[1] bit to check.

Don't we have enough bits without an additional one?

writeable: DBM == 1 || S2AP[1] == 1
  clean: S2AP[1] == 0
  dirty: S2AP[1] == 1 (irrespective of DBM)

read-only: DBM == 0 && S2AP[1] == 0

For S1 we use a software dirty bit as well to track read-only dirty
mappings but I don't think it is necessary for S2 since KVM unmaps the
PTE when changing the VMM permission.
Shameerali Kolothum Thodi Sept. 25, 2023, 8:04 a.m. UTC | #4
> -----Original Message-----
> From: Catalin Marinas [mailto:catalin.marinas@arm.com]
> Sent: 22 September 2023 16:40
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: kvmarm@lists.linux.dev; kvm@vger.kernel.org;
> linux-arm-kernel@lists.infradead.org; maz@kernel.org; will@kernel.org;
> oliver.upton@linux.dev; james.morse@arm.com; suzuki.poulose@arm.com;
> yuzenghui <yuzenghui@huawei.com>; zhukeqian
> <zhukeqian1@huawei.com>; Jonathan Cameron
> <jonathan.cameron@huawei.com>; Linuxarm <linuxarm@huawei.com>
> Subject: Re: [RFC PATCH v2 4/8] KVM: arm64: Set DBM for previously
> writeable pages
> 
> On Fri, Aug 25, 2023 at 10:35:24AM +0100, Shameer Kolothum wrote:
> > We only set DBM if the page is writeable (S2AP[1] == 1). But once
> migration
> > starts, CLEAR_LOG path will write protect the pages (S2AP[1] = 0) and
> there
> > isn't an easy way to differentiate the writeable pages that gets write
> > protected from read-only pages as we only have S2AP[1] bit to check.
> 
> Don't we have enough bits without an additional one?
> 
> writeable: DBM == 1 || S2AP[1] == 1
>   clean: S2AP[1] == 0
>   dirty: S2AP[1] == 1 (irrespective of DBM)
> 
> read-only: DBM == 0 && S2AP[1] == 0
> 
> For S1 we use a software dirty bit as well to track read-only dirty
> mappings but I don't think it is necessary for S2 since KVM unmaps the
> PTE when changing the VMM permission.
> 

We don't set the DBM for all the memory. In order to reduce the overhead
associated with scanning PTEs, this series sets the DBM for the nearby pages
on page fault during the migration phase.

See patch #8,
  user_mem_abort()
     kvm_arm_enable_nearby_hwdbm()

But once migration starts, on CLEAR_LOG path,
   kvm_arch_mmu_enable_log_dirty_pt_masked()
    stage2_wp_range()  --> set the page read only
    kvm_mmu_split_huge_pages() --> split huge pages and pages are read only.

This in effect means there are no writeable-clean nearby pages left for
kvm_arm_enable_nearby_hwdbm() to set DBM on.

To identify the pages on which DBM can be set, we provide a hint to the
stage2_wp_range() --> kvm_pgtable_stage2_wrprotect() table walker and make
use of a new software bit to mark the PTE as writeable-clean.

I hope that is clear.

Thanks,
Shameer
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index a12add002b89..67bcbc5984f9 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -190,6 +190,8 @@  enum kvm_pgtable_prot {
 #define KVM_PGTABLE_PROT_RW	(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
 #define KVM_PGTABLE_PROT_RWX	(KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
 
+#define KVM_PGTABLE_PROT_WC	KVM_PGTABLE_PROT_SW0  /*write-clean*/
+
 #define PKVM_HOST_MEM_PROT	KVM_PGTABLE_PROT_RWX
 #define PKVM_HOST_MMIO_PROT	KVM_PGTABLE_PROT_RW
 
@@ -221,6 +223,8 @@  typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
  *					operations required.
  * @KVM_PGTABLE_WALK_HW_DBM:		Indicates that the attribute update is
  *					HW DBM related.
+ * @KVM_PGTABLE_WALK_WC_HINT:		Update the page as writeable-clean(software attribute)
+ *					if we are write protecting a writeable page.
  */
 enum kvm_pgtable_walk_flags {
 	KVM_PGTABLE_WALK_LEAF			= BIT(0),
@@ -231,6 +235,7 @@  enum kvm_pgtable_walk_flags {
 	KVM_PGTABLE_WALK_SKIP_BBM_TLBI		= BIT(5),
 	KVM_PGTABLE_WALK_SKIP_CMO		= BIT(6),
 	KVM_PGTABLE_WALK_HW_DBM			= BIT(7),
+	KVM_PGTABLE_WALK_WC_HINT		= BIT(8),
 };
 
 struct kvm_pgtable_visit_ctx {
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index d7a46a00a7f6..4552bfb1f274 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -69,6 +69,11 @@  struct kvm_pgtable_walk_data {
 	const u64			end;
 };
 
+static bool kvm_pgtable_walk_wc_hint(const struct kvm_pgtable_visit_ctx *ctx)
+{
+	return ctx->flags & KVM_PGTABLE_WALK_WC_HINT;
+}
+
 static bool kvm_pgtable_walk_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx)
 {
 	return ctx->flags & KVM_PGTABLE_WALK_HW_DBM;
@@ -771,13 +776,24 @@  static bool stage2_pte_writeable(kvm_pte_t pte)
 	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
 }
 
+static bool stage2_pte_is_write_clean(kvm_pte_t pte)
+{
+	return kvm_pte_valid(pte) && (pte & KVM_PGTABLE_PROT_WC);
+}
+
+static bool stage2_pte_can_be_write_clean(const struct kvm_pgtable_visit_ctx *ctx,
+					  kvm_pte_t new)
+{
+	return (stage2_pte_writeable(ctx->old) && !stage2_pte_writeable(new));
+}
+
 static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
 			      kvm_pte_t new)
 {
 	kvm_pte_t old_pte, pte = ctx->old;
 
-	/* Only set DBM if page is writeable */
-	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte))
+	/* Only set DBM if page is writeable-clean */
+	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_is_write_clean(pte))
 		return;
 
 	/* Clear DBM walk is not shared, update */
@@ -805,6 +821,9 @@  static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_
 	}
 
 	if (!kvm_pgtable_walk_shared(ctx)) {
+		if (kvm_pgtable_walk_wc_hint(ctx) &&
+		    stage2_pte_can_be_write_clean(ctx, new))
+			new |= KVM_PGTABLE_PROT_WC;
 		WRITE_ONCE(*ctx->ptep, new);
 		return true;
 	}
@@ -1306,7 +1325,7 @@  int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
 	return stage2_update_leaf_attrs(pgt, addr, size, 0,
 					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
-					NULL, NULL, 0);
+					NULL, NULL, KVM_PGTABLE_WALK_WC_HINT);
 }
 
 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)