
[RFC,v2,3/8] KVM: arm64: Add some HW_DBM related pgtable interfaces

Message ID 20230825093528.1637-4-shameerali.kolothum.thodi@huawei.com (mailing list archive)
State New, archived
Series KVM: arm64: Implement SW/HW combined dirty log

Commit Message

Shameerali Kolothum Thodi Aug. 25, 2023, 9:35 a.m. UTC
From: Keqian Zhu <zhukeqian1@huawei.com>

This adds set_dbm, clear_dbm and sync_dirty interfaces to the pgtable
layer. (1) set_dbm: set the DBM bit on the last-level PTEs of a
specified range; TLB invalidation is performed. (2) clear_dbm: clear
the DBM bit on the last-level PTEs of a specified range; TLB
invalidation is not performed and is left to the caller. (3) sync_dirty:
scan the last-level PTEs of a specified range and log a page as dirty
if its PTE is writeable.

In addition, save the dirty state of a PTE if it is invalidated by map
or unmap.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 arch/arm64/include/asm/kvm_pgtable.h | 45 +++++++++++++
 arch/arm64/kernel/image-vars.h       |  2 +
 arch/arm64/kvm/hyp/pgtable.c         | 98 ++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+)

Comments

Oliver Upton Sept. 15, 2023, 10:22 p.m. UTC | #1
Hi Shameer,

On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote:
> From: Keqian Zhu <zhukeqian1@huawei.com>
> 
> This adds set_dbm, clear_dbm and sync_dirty interfaces in pgtable
> layer. (1) set_dbm: Set DBM bit for last level PTE of a specified
> range. TLBI is completed. (2) clear_dbm: Clear DBM bit for last
> level PTE of a specified range. TLBI is not acted. (3) sync_dirty:
> Scan last level PTE of a specific range. Log dirty if PTE is writeable.
> 
> Besides, save the dirty state of PTE if it's invalided by map or
> unmap.
> 
> Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h | 45 +++++++++++++
>  arch/arm64/kernel/image-vars.h       |  2 +
>  arch/arm64/kvm/hyp/pgtable.c         | 98 ++++++++++++++++++++++++++++
>  3 files changed, 145 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 3f96bdd2086f..a12add002b89 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -578,6 +578,51 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
>   */
>  int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
>  
> +/**
> + * kvm_pgtable_stage2_clear_dbm() - Clear DBM of guest stage-2 address range
> + *                                  without TLB invalidation (only last level).
> + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	Intermediate physical address from which to clear DBM,
> + * @size:	Size of the range.
> + *
> + * The offset of @addr within a page is ignored and @size is rounded-up to
> + * the next page boundary.
> + *
> + * Note that it is the caller's responsibility to invalidate the TLB after
> + * calling this function to ensure that the disabled HW dirty are visible
> + * to the CPUs.
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size);
> +
> +/**
> + * kvm_pgtable_stage2_set_dbm() - Set DBM of guest stage-2 address range to
> + *                                enable HW dirty (only last level).
> + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	Intermediate physical address from which to set DBM.
> + * @size:	Size of the range.
> + *
> + * The offset of @addr within a page is ignored and @size is rounded-up to
> + * the next page boundary.
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size);
> +
> +/**
> + * kvm_pgtable_stage2_sync_dirty() - Sync HW dirty state into memslot.
> + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	Intermediate physical address from which to sync.
> + * @size:	Size of the range.
> + *
> + * The offset of @addr within a page is ignored and @size is rounded-up to
> + * the next page boundary.
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr, u64 size);
> +
>  /**
>   * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
>   *                                  without TLB invalidation.
> diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
> index 35f3c7959513..2ca600e3d637 100644
> --- a/arch/arm64/kernel/image-vars.h
> +++ b/arch/arm64/kernel/image-vars.h
> @@ -68,6 +68,8 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors);
>  KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
>  KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
>  
> +KVM_NVHE_ALIAS(mark_page_dirty);
> +

This doesn't make any sense besides satisfying the linker. The hyp page
table walkers will *never* need to mark a page as dirty.

Consider adding a new function pointer to kvm_pgtable_mm_ops and only
set it for the 'normal' stage-2 mm ops in mmu.c.
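
For illustration, a rough sketch of how such a hook could look (the
callback name, signature and call sites below are assumptions for this
sketch, not something proposed in the thread):

/* kvm_pgtable.h: hypothetical extra callback */
struct kvm_pgtable_mm_ops {
	/* ... existing callbacks (zalloc_page, get_page, put_page, ...) ... */
	void (*dirty_log)(struct kvm_s2_mmu *mmu, u64 ipa);
};

/* mmu.c: only the 'normal' stage-2 mm ops would provide it */
static void stage2_dirty_log(struct kvm_s2_mmu *mmu, u64 ipa)
{
	mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ipa >> PAGE_SHIFT);
}

/* pgtable.c: the walker calls the hook only when it is present, so the
 * nVHE image no longer needs a KVM_NVHE_ALIAS for mark_page_dirty().
 */
if (mm_ops->dirty_log && stage2_pte_writeable(ctx->old))
	mm_ops->dirty_log(mmu, ctx->addr);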

> +static bool stage2_pte_writeable(kvm_pte_t pte)
> +{
> +	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
> +}
> +
> +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
> +			      kvm_pte_t new)
> +{
> +	kvm_pte_t old_pte, pte = ctx->old;
> +
> +	/* Only set DBM if page is writeable */
> +	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte))
> +		return;
> +
> +	/* Clear DBM walk is not shared, update */
> +	if (!kvm_pgtable_walk_shared(ctx)) {
> +		WRITE_ONCE(*ctx->ptep, new);
> +		return;
> +	}
> +
> +	do {
> +		old_pte = pte;
> +		pte = new;
> +
> +		if (old_pte == pte)
> +			break;
> +
> +		pte = cmpxchg_relaxed(ctx->ptep, old_pte, pte);
> +	} while (pte != old_pte);
> +}
> +

We don't need a separate walker for updating the DBM state in the
page tables. Can't we treat it like any other permission bit and use
something like kvm_pgtable_stage2_relax_perms()?

Also, what is the purpose of retrying the update on a descriptor if the
cmpxchg() fails? Everywhere else in the page table walker code we bail
and retry execution since that is a clear indication of a race. In all
likelihood the other vCPU thread was either trying to update DBM or
service a permission fault.
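
For reference, the existing shared-walk convention is roughly a single
cmpxchg() attempt whose failure is reported to the caller (which then
bails and lets the fault path retry), along the lines of:

static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	if (!kvm_pgtable_walk_shared(ctx)) {
		WRITE_ONCE(*ctx->ptep, new);
		return true;
	}

	/* One attempt only; a lost race is reported, not retried here. */
	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
}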

>  static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
>  {
> +	if (kvm_pgtable_walk_hw_dbm(ctx)) {
> +		kvm_update_hw_dbm(ctx, new);
> +		return true;
> +	}
> +

Why are you trying to circumvent the primitive for setting stage-2 PTEs?

>  	if (!kvm_pgtable_walk_shared(ctx)) {
>  		WRITE_ONCE(*ctx->ptep, new);
>  		return true;
> @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
>  	    stage2_pte_executable(new))
>  		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
>  
> +	/* Save the possible hardware dirty info */
> +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> +	    stage2_pte_writeable(ctx->old))
> +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT);
> +
>  	stage2_make_pte(ctx, new);
>  
>  	return 0;
> @@ -1125,6 +1168,11 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>  	 */
>  	stage2_unmap_put_pte(ctx, mmu, mm_ops);
>  
> +	/* Save the possible hardware dirty info */
> +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> +	    stage2_pte_writeable(ctx->old))
> +		mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ctx->addr >> PAGE_SHIFT);
> +
>  	if (need_flush && mm_ops->dcache_clean_inval_poc)
>  		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
>  					       kvm_granule_size(ctx->level));
> @@ -1230,6 +1278,30 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
>  	return 0;
>  }
>  
> +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size)
> +{
> +	int ret;
> +	u64 offset;
> +
> +	ret = stage2_update_leaf_attrs(pgt, addr, size, KVM_PTE_LEAF_ATTR_HI_S2_DBM, 0,
> +				       NULL, NULL, KVM_PGTABLE_WALK_HW_DBM |
> +				       KVM_PGTABLE_WALK_SHARED);
> +	if (!ret)
> +		return ret;
> +
> +	for (offset = 0; offset < size; offset += PAGE_SIZE)
> +		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr + offset, 3);
> +
> +	return 0;
> +}
> +
> +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size)
> +{
> +	return stage2_update_leaf_attrs(pgt, addr, size,
> +					0, KVM_PTE_LEAF_ATTR_HI_S2_DBM,
> +					NULL, NULL, KVM_PGTABLE_WALK_HW_DBM);
> +}
> +
>  int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
>  {
>  	return stage2_update_leaf_attrs(pgt, addr, size, 0,
> @@ -1329,6 +1401,32 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
>  	return ret;
>  }
>  
> +static int stage2_sync_dirty_walker(const struct kvm_pgtable_visit_ctx *ctx,
> +				    enum kvm_pgtable_walk_flags visit)
> +{
> +	kvm_pte_t pte = READ_ONCE(*ctx->ptep);

ctx->old is fetched in __kvm_pgtable_visit(), and the whole
parallelization scheme for page table walkers depends on us being
careful to read the PTE once per level. Do you need to reread it here?
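
A minimal sketch of the walker reworked to use that snapshot (an
illustration of the suggested fix, not the posted code):

static int stage2_sync_dirty_walker(const struct kvm_pgtable_visit_ctx *ctx,
				    enum kvm_pgtable_walk_flags visit)
{
	struct kvm *kvm = ctx->arg;

	/* ctx->old is the snapshot taken once per level by __kvm_pgtable_visit() */
	if (!kvm_pte_valid(ctx->old))
		return 0;

	if (ctx->level == KVM_PGTABLE_MAX_LEVELS - 1 &&
	    stage2_pte_writeable(ctx->old))
		mark_page_dirty(kvm, ctx->addr >> PAGE_SHIFT);

	return 0;
}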
Shameerali Kolothum Thodi Sept. 18, 2023, 9:53 a.m. UTC | #2
> Hi Shameer,
> 
> On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote:
> > From: Keqian Zhu <zhukeqian1@huawei.com>
> >
> > This adds set_dbm, clear_dbm and sync_dirty interfaces in pgtable
> > layer. (1) set_dbm: Set DBM bit for last level PTE of a specified
> > range. TLBI is completed. (2) clear_dbm: Clear DBM bit for last
> > level PTE of a specified range. TLBI is not acted. (3) sync_dirty:
> > Scan last level PTE of a specific range. Log dirty if PTE is writeable.
> >
> > Besides, save the dirty state of PTE if it's invalided by map or
> > unmap.
> >
> > Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
> > Signed-off-by: Shameer Kolothum
> <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h | 45 +++++++++++++
> >  arch/arm64/kernel/image-vars.h       |  2 +
> >  arch/arm64/kvm/hyp/pgtable.c         | 98
> ++++++++++++++++++++++++++++
> >  3 files changed, 145 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h
> b/arch/arm64/include/asm/kvm_pgtable.h
> > index 3f96bdd2086f..a12add002b89 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -578,6 +578,51 @@ int kvm_pgtable_stage2_set_owner(struct
> kvm_pgtable *pgt, u64 addr, u64 size,
> >   */
> >  int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64
> size);
> >
> > +/**
> > + * kvm_pgtable_stage2_clear_dbm() - Clear DBM of guest stage-2 address
> range
> > + *                                  without TLB invalidation (only
> last level).
> > + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> > + * @addr:	Intermediate physical address from which to clear DBM,
> > + * @size:	Size of the range.
> > + *
> > + * The offset of @addr within a page is ignored and @size is rounded-up
> to
> > + * the next page boundary.
> > + *
> > + * Note that it is the caller's responsibility to invalidate the TLB after
> > + * calling this function to ensure that the disabled HW dirty are visible
> > + * to the CPUs.
> > + *
> > + * Return: 0 on success, negative error code on failure.
> > + */
> > +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr,
> u64 size);
> > +
> > +/**
> > + * kvm_pgtable_stage2_set_dbm() - Set DBM of guest stage-2 address
> range to
> > + *                                enable HW dirty (only last level).
> > + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> > + * @addr:	Intermediate physical address from which to set DBM.
> > + * @size:	Size of the range.
> > + *
> > + * The offset of @addr within a page is ignored and @size is rounded-up
> to
> > + * the next page boundary.
> > + *
> > + * Return: 0 on success, negative error code on failure.
> > + */
> > +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64
> size);
> > +
> > +/**
> > + * kvm_pgtable_stage2_sync_dirty() - Sync HW dirty state into memslot.
> > + * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
> > + * @addr:	Intermediate physical address from which to sync.
> > + * @size:	Size of the range.
> > + *
> > + * The offset of @addr within a page is ignored and @size is rounded-up
> to
> > + * the next page boundary.
> > + *
> > + * Return: 0 on success, negative error code on failure.
> > + */
> > +int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr,
> u64 size);
> > +
> >  /**
> >   * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2
> address range
> >   *                                  without TLB invalidation.
> > diff --git a/arch/arm64/kernel/image-vars.h
> b/arch/arm64/kernel/image-vars.h
> > index 35f3c7959513..2ca600e3d637 100644
> > --- a/arch/arm64/kernel/image-vars.h
> > +++ b/arch/arm64/kernel/image-vars.h
> > @@ -68,6 +68,8 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors);
> >  KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
> >  KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
> >
> > +KVM_NVHE_ALIAS(mark_page_dirty);
> > +
> 
> This doesn't make any sense besides satisfying the linker. The hyp page
> table walkers will *never* need to mark a page as dirty.
> 
> Consider adding a new function pointer to kvm_pgtable_mm_ops and only
> set it for the 'normal' stage-2 mm ops in mmu.c.

Yes, this is only to satisfy the linker for NVHE. I will add the function pointer
to handle this.

> 
> > +static bool stage2_pte_writeable(kvm_pte_t pte)
> > +{
> > +	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
> > +}
> > +
> > +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
> > +			      kvm_pte_t new)
> > +{
> > +	kvm_pte_t old_pte, pte = ctx->old;
> > +
> > +	/* Only set DBM if page is writeable */
> > +	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM)
> && !stage2_pte_writeable(pte))
> > +		return;
> > +
> > +	/* Clear DBM walk is not shared, update */
> > +	if (!kvm_pgtable_walk_shared(ctx)) {
> > +		WRITE_ONCE(*ctx->ptep, new);
> > +		return;
> > +	}
> > +
> > +	do {
> > +		old_pte = pte;
> > +		pte = new;
> > +
> > +		if (old_pte == pte)
> > +			break;
> > +
> > +		pte = cmpxchg_relaxed(ctx->ptep, old_pte, pte);
> > +	} while (pte != old_pte);
> > +}
> > +
> 
> We don't need a separate walker for updating the DBM state in the
> page tables. Can't we treat it like any other permission bit and use
> something like kvm_pgtable_stage2_relax_perms()?

I treated it separately to isolate it from the rest of the PTE updates for
ease of debug/test initially. But yes, it looks like we can treat it generally.

> Also, what is the purpose of retrying the update on a descriptor if the
> cmpxchg() fails? Everywhere else in the page table walker code we bail
> and retry execution since that is a clear indication of a race. In all
> likelihood the other vCPU thread was either trying to update DBM or
> service a permission fault.

The retry was added to avoid a situation where it returns without setting
the DBM bit and we end up scanning for dirty pages unnecessarily. I will add
some debug info to see the number of races we are getting and their impact
on performance.

> 
> >  static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx,
> kvm_pte_t new)
> >  {
> > +	if (kvm_pgtable_walk_hw_dbm(ctx)) {
> > +		kvm_update_hw_dbm(ctx, new);
> > +		return true;
> > +	}
> > +
> 
> Why are you trying to circumvent the primitive for setting stage-2 PTEs?

I'm not sure it actually does that from a functionality point of view, as the
_WALK_HW_DBM flag is only set from the set_dbm/clear_dbm paths. But yes, we
can do better if we get rid of the above retry logic, I guess.

> 
> >  	if (!kvm_pgtable_walk_shared(ctx)) {
> >  		WRITE_ONCE(*ctx->ptep, new);
> >  		return true;
> > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const
> struct kvm_pgtable_visit_ctx *ctx,
> >  	    stage2_pte_executable(new))
> >  		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops),
> granule);
> >
> > +	/* Save the possible hardware dirty info */
> > +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> > +	    stage2_pte_writeable(ctx->old))
> > +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >>
> PAGE_SHIFT);
> > +
> >  	stage2_make_pte(ctx, new);
> >
> >  	return 0;
> > @@ -1125,6 +1168,11 @@ static int stage2_unmap_walker(const struct
> kvm_pgtable_visit_ctx *ctx,
> >  	 */
> >  	stage2_unmap_put_pte(ctx, mmu, mm_ops);
> >
> > +	/* Save the possible hardware dirty info */
> > +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> > +	    stage2_pte_writeable(ctx->old))
> > +		mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ctx->addr >>
> PAGE_SHIFT);
> > +
> >  	if (need_flush && mm_ops->dcache_clean_inval_poc)
> >  		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old,
> mm_ops),
> >  					       kvm_granule_size(ctx->level));
> > @@ -1230,6 +1278,30 @@ static int stage2_update_leaf_attrs(struct
> kvm_pgtable *pgt, u64 addr,
> >  	return 0;
> >  }
> >
> > +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64
> size)
> > +{
> > +	int ret;
> > +	u64 offset;
> > +
> > +	ret = stage2_update_leaf_attrs(pgt, addr, size,
> KVM_PTE_LEAF_ATTR_HI_S2_DBM, 0,
> > +				       NULL, NULL, KVM_PGTABLE_WALK_HW_DBM |
> > +				       KVM_PGTABLE_WALK_SHARED);
> > +	if (!ret)
> > +		return ret;
> > +
> > +	for (offset = 0; offset < size; offset += PAGE_SIZE)
> > +		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr +
> offset, 3);
> > +
> > +	return 0;
> > +}
> > +
> > +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr,
> u64 size)
> > +{
> > +	return stage2_update_leaf_attrs(pgt, addr, size,
> > +					0, KVM_PTE_LEAF_ATTR_HI_S2_DBM,
> > +					NULL, NULL, KVM_PGTABLE_WALK_HW_DBM);
> > +}
> > +
> >  int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr,
> u64 size)
> >  {
> >  	return stage2_update_leaf_attrs(pgt, addr, size, 0,
> > @@ -1329,6 +1401,32 @@ int kvm_pgtable_stage2_relax_perms(struct
> kvm_pgtable *pgt, u64 addr,
> >  	return ret;
> >  }
> >
> > +static int stage2_sync_dirty_walker(const struct kvm_pgtable_visit_ctx
> *ctx,
> > +				    enum kvm_pgtable_walk_flags visit)
> > +{
> > +	kvm_pte_t pte = READ_ONCE(*ctx->ptep);
> 
> ctx->old is fetched in __kvm_pgtable_visit(), and the whole
> parallelization scheme for page table walkers depends on us being
> careful to read the PTE once per level. Do you need to reread it here?

Nope. That's a mistake. Will change that.

Thanks,
Shameer
Catalin Marinas Sept. 22, 2023, 3:24 p.m. UTC | #3
On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote:
> +static bool stage2_pte_writeable(kvm_pte_t pte)
> +{
> +	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
> +}
> +
> +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
> +			      kvm_pte_t new)
> +{
> +	kvm_pte_t old_pte, pte = ctx->old;
> +
> +	/* Only set DBM if page is writeable */
> +	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte))
> +		return;
> +
> +	/* Clear DBM walk is not shared, update */
> +	if (!kvm_pgtable_walk_shared(ctx)) {
> +		WRITE_ONCE(*ctx->ptep, new);
> +		return;
> +	}

I was wondering if this interferes with the OS dirty tracking (not the
KVM one) but I think that's ok, at least at this point, since the PTE is
already writeable and a fault would have marked the underlying page as
dirty (user_mem_abort() -> kvm_set_pfn_dirty()).

I'm not particularly fond of relying on this but I need to see how it
fits with the rest of the series. IIRC KVM doesn't go around and make
Stage 2 PTEs read-only but rather unmaps them when it changes the
permission of the corresponding Stage 1 VMM mapping.

My personal preference would be to track dirty/clean properly as we do
for stage 1 (e.g. DBM means writeable PTE) but it has some downsides
like the try_to_unmap() code having to retrieve the dirty state via
notifiers.

Anyway, assuming this works correctly, it means that live migration via
DBM is only tracked for PTEs already made dirty/writeable by some guest
write.

> @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
>  	    stage2_pte_executable(new))
>  		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
>  
> +	/* Save the possible hardware dirty info */
> +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> +	    stage2_pte_writeable(ctx->old))
> +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT);
> +
>  	stage2_make_pte(ctx, new);

Isn't this racy and potentially losing the dirty state? Or is the 'new'
value guaranteed to have the S2AP[1] bit? For stage 1 we normally make
the page genuinely read-only (clearing DBM) in a cmpxchg loop to
preserve the dirty state (see ptep_set_wrprotect()).
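
For comparison, a sketch of that stage-1 style applied here (an assumption
for illustration only; old, new, ptep, kvm and addr are taken to be in scope):

/* Clear DBM and write permission atomically so a concurrent hardware
 * update of S2AP[1] cannot be lost between the read and the write.
 */
do {
	old = READ_ONCE(*ptep);
	new = old & ~(KVM_PTE_LEAF_ATTR_HI_S2_DBM |
		      KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W);
} while (cmpxchg(ptep, old, new) != old);

if (stage2_pte_writeable(old))	/* hardware dirty state captured before clearing */
	mark_page_dirty(kvm, addr >> PAGE_SHIFT);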
Oliver Upton Sept. 22, 2023, 5:49 p.m. UTC | #4
On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote:
> On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote:
> > +static bool stage2_pte_writeable(kvm_pte_t pte)
> > +{
> > +	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
> > +}
> > +
> > +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
> > +			      kvm_pte_t new)
> > +{
> > +	kvm_pte_t old_pte, pte = ctx->old;
> > +
> > +	/* Only set DBM if page is writeable */
> > +	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte))
> > +		return;
> > +
> > +	/* Clear DBM walk is not shared, update */
> > +	if (!kvm_pgtable_walk_shared(ctx)) {
> > +		WRITE_ONCE(*ctx->ptep, new);
> > +		return;
> > +	}
> 
> I was wondering if this interferes with the OS dirty tracking (not the
> KVM one) but I think that's ok, at least at this point, since the PTE is
> already writeable and a fault would have marked the underlying page as
> dirty (user_mem_abort() -> kvm_set_pfn_dirty()).
> 
> I'm not particularly fond of relying on this but I need to see how it
> fits with the rest of the series. IIRC KVM doesn't go around and make
> Stage 2 PTEs read-only but rather unmaps them when it changes the
> permission of the corresponding Stage 1 VMM mapping.
> 
> My personal preference would be to track dirty/clean properly as we do
> for stage 1 (e.g. DBM means writeable PTE) but it has some downsides
> like the try_to_unmap() code having to retrieve the dirty state via
> notifiers.

KVM's usage of DBM is complicated by the fact that the dirty log
interface w/ userspace is at PTE granularity. We only want the page
table walker to relax PTEs, but take faults on hugepages so we can do
page splitting.
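
As a sketch of that constraint (an illustration, not code from this series),
a DBM-setting walker would only relax last-level leaves and leave block
mappings write-protected so that hugepage writes still fault and can be split:

/* Only arm DBM on last-level leaves; block mappings keep faulting. */
if (kvm_pte_valid(ctx->old) &&
    ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)
	new |= KVM_PTE_LEAF_ATTR_HI_S2_DBM;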

> Anyway, assuming this works correctly, it means that live migration via
> DBM is only tracked for PTEs already made dirty/writeable by some guest
> write.

I'm hoping that we move away from this combined write-protection and DBM
scheme and only use a single dirty tracking strategy at a time.

> > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
> >  	    stage2_pte_executable(new))
> >  		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
> >  
> > +	/* Save the possible hardware dirty info */
> > +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> > +	    stage2_pte_writeable(ctx->old))
> > +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT);
> > +
> >  	stage2_make_pte(ctx, new);
> 
> Isn't this racy and potentially losing the dirty state? Or is the 'new'
> value guaranteed to have the S2AP[1] bit? For stage 1 we normally make
> the page genuinely read-only (clearing DBM) in a cmpxchg loop to
> preserve the dirty state (see ptep_set_wrprotect()).

stage2_try_break_pte() a few lines up does a cmpxchg() and full
break-before-make, so at this point there shouldn't be a race with
either software or hardware table walkers.

But I'm still confused by this one. KVM only goes down the map
walker path (in the context of dirty tracking) if:

 - We took a translation fault

 - We took a write permission fault on a hugepage and need to split

In both cases the 'old' translation should have DBM cleared. Even if the
PTE were dirty, this is wasted work since we need to do a final scan of
the stage-2 when userspace collects the dirty log.

Am I missing something?
Shameerali Kolothum Thodi Sept. 25, 2023, 8:04 a.m. UTC | #5
> On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote:
> > On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote:
> > > +static bool stage2_pte_writeable(kvm_pte_t pte)
> > > +{
> > > +	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
> > > +}
> > > +
> > > +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx
> *ctx,
> > > +			      kvm_pte_t new)
> > > +{
> > > +	kvm_pte_t old_pte, pte = ctx->old;
> > > +
> > > +	/* Only set DBM if page is writeable */
> > > +	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM)
> && !stage2_pte_writeable(pte))
> > > +		return;
> > > +
> > > +	/* Clear DBM walk is not shared, update */
> > > +	if (!kvm_pgtable_walk_shared(ctx)) {
> > > +		WRITE_ONCE(*ctx->ptep, new);
> > > +		return;
> > > +	}
> >
> > I was wondering if this interferes with the OS dirty tracking (not the
> > KVM one) but I think that's ok, at least at this point, since the PTE is
> > already writeable and a fault would have marked the underlying page as
> > dirty (user_mem_abort() -> kvm_set_pfn_dirty()).
> >
> > I'm not particularly fond of relying on this but I need to see how it
> > fits with the rest of the series. IIRC KVM doesn't go around and make
> > Stage 2 PTEs read-only but rather unmaps them when it changes the
> > permission of the corresponding Stage 1 VMM mapping.
> >
> > My personal preference would be to track dirty/clean properly as we do
> > for stage 1 (e.g. DBM means writeable PTE) but it has some downsides
> > like the try_to_unmap() code having to retrieve the dirty state via
> > notifiers.
> 
> KVM's usage of DBM is complicated by the fact that the dirty log
> interface w/ userspace is at PTE granularity. We only want the page
> table walker to relax PTEs, but take faults on hugepages so we can do
> page splitting.
> 
> > Anyway, assuming this works correctly, it means that live migration via
> > DBM is only tracked for PTEs already made dirty/writeable by some guest
> > write.
> 
> I'm hoping that we move away from this combined write-protection and
> DBM
> scheme and only use a single dirty tracking strategy at a time.

Yes. As mentioned in the cover letter, this is a combined approach where we
only set DBM for nearby pages (64) on a page fault once migration has started.
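
For context, a hypothetical sketch of that fault-time DBM window (the helper
name, window handling and constant below are assumptions based on the cover
letter, not this patch):

#define DBM_FAULT_WINDOW_PAGES	64	/* assumption: per the cover letter */

static void stage2_set_dbm_around(struct kvm_pgtable *pgt, u64 fault_ipa)
{
	u64 start = ALIGN_DOWN(fault_ipa, DBM_FAULT_WINDOW_PAGES * PAGE_SIZE);

	/* Enable hardware dirty tracking on the neighbouring last-level PTEs */
	kvm_pgtable_stage2_set_dbm(pgt, start,
				   DBM_FAULT_WINDOW_PAGES * PAGE_SIZE);
}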

> 
> > > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const
> struct kvm_pgtable_visit_ctx *ctx,
> > >  	    stage2_pte_executable(new))
> > >  		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops),
> granule);
> > >
> > > +	/* Save the possible hardware dirty info */
> > > +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> > > +	    stage2_pte_writeable(ctx->old))
> > > +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu),
> ctx->addr >> PAGE_SHIFT);
> > > +
> > >  	stage2_make_pte(ctx, new);
> >
> > Isn't this racy and potentially losing the dirty state? Or is the 'new'
> > value guaranteed to have the S2AP[1] bit? For stage 1 we normally make
> > the page genuinely read-only (clearing DBM) in a cmpxchg loop to
> > preserve the dirty state (see ptep_set_wrprotect()).
> 
> stage2_try_break_pte() a few lines up does a cmpxchg() and full
> break-before-make, so at this point there shouldn't be a race with
> either software or hardware table walkers.
> 
> But I'm still confused by this one. KVM only goes down the map
> walker path (in the context of dirty tracking) if:
> 
>  - We took a translation fault
> 
>  - We took a write permission fault on a hugepage and need to split

Agree.

> In both cases the 'old' translation should have DBM cleared. Even if the
> PTE were dirty, this is wasted work since we need to do a final scan of
> the stage-2 when userspace collects the dirty log.
> 
> Am I missing something?

I think we can get rid of the above mark_page_dirty(). I will test it to confirm
we are not missing anything here.
 
Thanks,
Shameer
Catalin Marinas Sept. 26, 2023, 3:20 p.m. UTC | #6
On Mon, Sep 25, 2023 at 08:04:39AM +0000, Shameerali Kolothum Thodi wrote:
> From: Oliver Upton [mailto:oliver.upton@linux.dev]
> > On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote:
> > > I was wondering if this interferes with the OS dirty tracking (not the
> > > KVM one) but I think that's ok, at least at this point, since the PTE is
> > > already writeable and a fault would have marked the underlying page as
> > > dirty (user_mem_abort() -> kvm_set_pfn_dirty()).
> > >
> > > I'm not particularly fond of relying on this but I need to see how it
> > > fits with the rest of the series. IIRC KVM doesn't go around and make
> > > Stage 2 PTEs read-only but rather unmaps them when it changes the
> > > permission of the corresponding Stage 1 VMM mapping.
> > >
> > > My personal preference would be to track dirty/clean properly as we do
> > > for stage 1 (e.g. DBM means writeable PTE) but it has some downsides
> > > like the try_to_unmap() code having to retrieve the dirty state via
> > > notifiers.
> > 
> > KVM's usage of DBM is complicated by the fact that the dirty log
> > interface w/ userspace is at PTE granularity. We only want the page
> > table walker to relax PTEs, but take faults on hugepages so we can do
> > page splitting.

Thanks for the clarification.

> > > > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
> > > >  	    stage2_pte_executable(new))
> > > >  		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
> > > >
> > > > +	/* Save the possible hardware dirty info */
> > > > +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> > > > +	    stage2_pte_writeable(ctx->old))
> > > > +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT);
> > > > +
> > > >  	stage2_make_pte(ctx, new);
> > >
> > > Isn't this racy and potentially losing the dirty state? Or is the 'new'
> > > value guaranteed to have the S2AP[1] bit? For stage 1 we normally make
> > > the page genuinely read-only (clearing DBM) in a cmpxchg loop to
> > > preserve the dirty state (see ptep_set_wrprotect()).
> > 
> > stage2_try_break_pte() a few lines up does a cmpxchg() and full
> > break-before-make, so at this point there shouldn't be a race with
> > either software or hardware table walkers.

Ah, I missed this. Also it was unrelated to this patch (or rather not
introduced by this patch).

> > In both cases the 'old' translation should have DBM cleared. Even if the
> > PTE were dirty, this is wasted work since we need to do a final scan of
> > the stage-2 when userspace collects the dirty log.
> > 
> > Am I missing something?
> 
> I think we can get rid of the above mark_page_dirty(). I will test it to confirm
> we are not missing anything here.

Is this the case for the other places of mark_page_dirty() in your
patches? If stage2_pte_writeable() is true, it must have been made
writeable earlier by a fault and the underlying page marked as dirty.
Shameerali Kolothum Thodi Sept. 26, 2023, 3:52 p.m. UTC | #7
> On Mon, Sep 25, 2023 at 08:04:39AM +0000, Shameerali Kolothum Thodi
> wrote:
> > From: Oliver Upton [mailto:oliver.upton@linux.dev]
> > > On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote:
> > > > I was wondering if this interferes with the OS dirty tracking (not the
> > > > KVM one) but I think that's ok, at least at this point, since the PTE is
> > > > already writeable and a fault would have marked the underlying page
> as
> > > > dirty (user_mem_abort() -> kvm_set_pfn_dirty()).
> > > >
> > > > I'm not particularly fond of relying on this but I need to see how it
> > > > fits with the rest of the series. IIRC KVM doesn't go around and make
> > > > Stage 2 PTEs read-only but rather unmaps them when it changes the
> > > > permission of the corresponding Stage 1 VMM mapping.
> > > >
> > > > My personal preference would be to track dirty/clean properly as we do
> > > > for stage 1 (e.g. DBM means writeable PTE) but it has some downsides
> > > > like the try_to_unmap() code having to retrieve the dirty state via
> > > > notifiers.
> > >
> > > KVM's usage of DBM is complicated by the fact that the dirty log
> > > interface w/ userspace is at PTE granularity. We only want the page
> > > table walker to relax PTEs, but take faults on hugepages so we can do
> > > page splitting.
> 
> Thanks for the clarification.
> 
> > > > > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const
> struct kvm_pgtable_visit_ctx *ctx,
> > > > >  	    stage2_pte_executable(new))
> > > > >  		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops),
> granule);
> > > > >
> > > > > +	/* Save the possible hardware dirty info */
> > > > > +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> > > > > +	    stage2_pte_writeable(ctx->old))
> > > > > +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu),
> ctx->addr >> PAGE_SHIFT);
> > > > > +
> > > > >  	stage2_make_pte(ctx, new);
> > > >
> > > > Isn't this racy and potentially losing the dirty state? Or is the 'new'
> > > > value guaranteed to have the S2AP[1] bit? For stage 1 we normally
> make
> > > > the page genuinely read-only (clearing DBM) in a cmpxchg loop to
> > > > preserve the dirty state (see ptep_set_wrprotect()).
> > >
> > > stage2_try_break_pte() a few lines up does a cmpxchg() and full
> > > break-before-make, so at this point there shouldn't be a race with
> > > either software or hardware table walkers.
> 
> Ah, I missed this. Also it was unrelated to this patch (or rather not
> introduced by this patch).
> 
> > > In both cases the 'old' translation should have DBM cleared. Even if the
> > > PTE were dirty, this is wasted work since we need to do a final scan of
> > > the stage-2 when userspace collects the dirty log.
> > >
> > > Am I missing something?
> >
> > I think we can get rid of the above mark_page_dirty(). I will test it to
> confirm
> > we are not missing anything here.
> 
> Is this the case for the other places of mark_page_dirty() in your
> patches? If stage2_pte_writeable() is true, it must have been made
> writeable earlier by a fault and the underlying page marked as dirty.
> 

One of the other places we have mark_page_dirty() is in stage2_unmap_walker().
During testing of this series, I tried removing that and found that it
actually causes memory corruption during VM migration.

From my old debug logs:

[  399.288076]  stage2_unmap_walker+0x270/0x284
[  399.288078]  __kvm_pgtable_walk+0x1ec/0x2cc
[  399.288081]  __kvm_pgtable_walk+0xec/0x2cc
[  399.288084]  __kvm_pgtable_walk+0xec/0x2cc
[  399.288086]  kvm_pgtable_walk+0xcc/0x160
[  399.288088]  kvm_pgtable_stage2_unmap+0x4c/0xbc
[  399.288091]  stage2_apply_range+0xd0/0xec
[  399.288094]  __unmap_stage2_range+0x2c/0x60
[  399.288096]  kvm_unmap_gfn_range+0x30/0x48
[  399.288099]  kvm_mmu_notifier_invalidate_range_start+0xe0/0x264
[  399.288103]  __mmu_notifier_invalidate_range_start+0xa4/0x23c
[  399.288106]  change_protection+0x638/0x900
[  399.288109]  change_prot_numa+0x64/0xfc
[  399.288113]  task_numa_work+0x2ac/0x450
[  399.288117]  task_work_run+0x78/0xd0

It looks like the unmap path gets triggered from the NUMA page migration code
path, so we may need it there.

Thanks,
Shameer
Catalin Marinas Sept. 26, 2023, 4:37 p.m. UTC | #8
On Tue, Sep 26, 2023 at 03:52:19PM +0000, Shameerali Kolothum Thodi wrote:
> From: Catalin Marinas [mailto:catalin.marinas@arm.com]
> > On Mon, Sep 25, 2023 at 08:04:39AM +0000, Shameerali Kolothum Thodi wrote:
> > > From: Oliver Upton [mailto:oliver.upton@linux.dev]
> > > > In both cases the 'old' translation should have DBM cleared. Even if the
> > > > PTE were dirty, this is wasted work since we need to do a final scan of
> > > > the stage-2 when userspace collects the dirty log.
> > > >
> > > > Am I missing something?
> > >
> > > I think we can get rid of the above mark_page_dirty(). I will test it to confirm
> > > we are not missing anything here.
> > 
> > Is this the case for the other places of mark_page_dirty() in your
> > patches? If stage2_pte_writeable() is true, it must have been made
> > writeable earlier by a fault and the underlying page marked as dirty.
> > 
> 
> One of the other place we have mark_page_dirty() is in the stage2_unmap_walker().
> And during the testing of this series, I have tried to remove that and
> found out that it actually causes memory corruption during VM migration.
> 
> From my old debug logs:
> 
> [  399.288076]  stage2_unmap_walker+0x270/0x284
> [  399.288078]  __kvm_pgtable_walk+0x1ec/0x2cc
> [  399.288081]  __kvm_pgtable_walk+0xec/0x2cc
> [  399.288084]  __kvm_pgtable_walk+0xec/0x2cc
> [  399.288086]  kvm_pgtable_walk+0xcc/0x160
> [  399.288088]  kvm_pgtable_stage2_unmap+0x4c/0xbc
> [  399.288091]  stage2_apply_range+0xd0/0xec
> [  399.288094]  __unmap_stage2_range+0x2c/0x60
> [  399.288096]  kvm_unmap_gfn_range+0x30/0x48
> [  399.288099]  kvm_mmu_notifier_invalidate_range_start+0xe0/0x264
> [  399.288103]  __mmu_notifier_invalidate_range_start+0xa4/0x23c
> [  399.288106]  change_protection+0x638/0x900
> [  399.288109]  change_prot_numa+0x64/0xfc
> [  399.288113]  task_numa_work+0x2ac/0x450
> [  399.288117]  task_work_run+0x78/0xd0
> 
> It looks like that the unmap path gets triggered from Numa page migration code
> path, so we may need it there.

I think I confused things (should have looked at the code).
mark_page_dirty() is actually about marking the bitmap for dirty
tracking rather than the actual struct page. The latter is hidden
somewhere deep down the get_user_pages() code.

So yeah, I think we still need mark_page_dirty() in some places to avoid
losing the dirty information with DBM turned on.
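
For reference, mark_page_dirty() is roughly the following (simplified): it
records the gfn in the memslot's dirty bitmap or dirty ring and never touches
the struct page itself:

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	mark_page_dirty_in_slot(kvm, memslot, gfn);
}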

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 3f96bdd2086f..a12add002b89 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -578,6 +578,51 @@  int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
  */
 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
+/**
+ * kvm_pgtable_stage2_clear_dbm() - Clear DBM of guest stage-2 address range
+ *                                  without TLB invalidation (only last level).
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address from which to clear DBM,
+ * @size:	Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the disabled HW dirty are visible
+ * to the CPUs.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_set_dbm() - Set DBM of guest stage-2 address range to
+ *                                enable HW dirty (only last level).
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address from which to set DBM.
+ * @size:	Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_sync_dirty() - Sync HW dirty state into memslot.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address from which to sync.
+ * @size:	Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
 /**
  * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
  *                                  without TLB invalidation.
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 35f3c7959513..2ca600e3d637 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -68,6 +68,8 @@  KVM_NVHE_ALIAS(__hyp_stub_vectors);
 KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
 KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
 
+KVM_NVHE_ALIAS(mark_page_dirty);
+
 #ifdef CONFIG_ARM64_PSEUDO_NMI
 /* Static key checked in GIC_PRIO_IRQOFF. */
 KVM_NVHE_ALIAS(gic_nonsecure_priorities);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 1e65b8c97059..d7a46a00a7f6 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -9,6 +9,7 @@ 
 
 #include <linux/bitfield.h>
 #include <asm/kvm_pgtable.h>
+#include <asm/kvm_mmu.h>
 #include <asm/stage2_pgtable.h>
 
 
@@ -42,6 +43,7 @@ 
 
 #define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)
 
+#define KVM_PTE_LEAF_ATTR_HI_S2_DBM	BIT(51)
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)
 
 #define KVM_PTE_LEAF_ATTR_HI_S1_GP	BIT(50)
@@ -764,8 +766,44 @@  static bool stage2_pte_is_locked(kvm_pte_t pte)
 	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
 }
 
+static bool stage2_pte_writeable(kvm_pte_t pte)
+{
+	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+}
+
+static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
+			      kvm_pte_t new)
+{
+	kvm_pte_t old_pte, pte = ctx->old;
+
+	/* Only set DBM if page is writeable */
+	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte))
+		return;
+
+	/* Clear DBM walk is not shared, update */
+	if (!kvm_pgtable_walk_shared(ctx)) {
+		WRITE_ONCE(*ctx->ptep, new);
+		return;
+	}
+
+	do {
+		old_pte = pte;
+		pte = new;
+
+		if (old_pte == pte)
+			break;
+
+		pte = cmpxchg_relaxed(ctx->ptep, old_pte, pte);
+	} while (pte != old_pte);
+}
+
 static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
 {
+	if (kvm_pgtable_walk_hw_dbm(ctx)) {
+		kvm_update_hw_dbm(ctx, new);
+		return true;
+	}
+
 	if (!kvm_pgtable_walk_shared(ctx)) {
 		WRITE_ONCE(*ctx->ptep, new);
 		return true;
@@ -952,6 +990,11 @@  static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
 	    stage2_pte_executable(new))
 		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
 
+	/* Save the possible hardware dirty info */
+	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
+	    stage2_pte_writeable(ctx->old))
+		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT);
+
 	stage2_make_pte(ctx, new);
 
 	return 0;
@@ -1125,6 +1168,11 @@  static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 	 */
 	stage2_unmap_put_pte(ctx, mmu, mm_ops);
 
+	/* Save the possible hardware dirty info */
+	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
+	    stage2_pte_writeable(ctx->old))
+		mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ctx->addr >> PAGE_SHIFT);
+
 	if (need_flush && mm_ops->dcache_clean_inval_poc)
 		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
 					       kvm_granule_size(ctx->level));
@@ -1230,6 +1278,30 @@  static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
 	return 0;
 }
 
+int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+	int ret;
+	u64 offset;
+
+	ret = stage2_update_leaf_attrs(pgt, addr, size, KVM_PTE_LEAF_ATTR_HI_S2_DBM, 0,
+				       NULL, NULL, KVM_PGTABLE_WALK_HW_DBM |
+				       KVM_PGTABLE_WALK_SHARED);
+	if (!ret)
+		return ret;
+
+	for (offset = 0; offset < size; offset += PAGE_SIZE)
+		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr + offset, 3);
+
+	return 0;
+}
+
+int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+	return stage2_update_leaf_attrs(pgt, addr, size,
+					0, KVM_PTE_LEAF_ATTR_HI_S2_DBM,
+					NULL, NULL, KVM_PGTABLE_WALK_HW_DBM);
+}
+
 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
 	return stage2_update_leaf_attrs(pgt, addr, size, 0,
@@ -1329,6 +1401,32 @@  int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 	return ret;
 }
 
+static int stage2_sync_dirty_walker(const struct kvm_pgtable_visit_ctx *ctx,
+				    enum kvm_pgtable_walk_flags visit)
+{
+	kvm_pte_t pte = READ_ONCE(*ctx->ptep);
+	struct kvm *kvm = ctx->arg;
+
+	if (!kvm_pte_valid(pte))
+		return 0;
+
+	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && stage2_pte_writeable(pte))
+		mark_page_dirty(kvm, ctx->addr >> PAGE_SHIFT);
+
+	return 0;
+}
+
+int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_sync_dirty_walker,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+		.arg	= kvm_s2_mmu_to_kvm(pgt->mmu),
+	};
+
+	return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
 static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
 			       enum kvm_pgtable_walk_flags visit)
 {