Message ID | 20230825093528.1637-4-shameerali.kolothum.thodi@huawei.com
---|---
State | New, archived
Series | KVM: arm64: Implement SW/HW combined dirty log
Hi Shameer, On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote: > From: Keqian Zhu <zhukeqian1@huawei.com> > > This adds set_dbm, clear_dbm and sync_dirty interfaces in pgtable > layer. (1) set_dbm: Set DBM bit for last level PTE of a specified > range. TLBI is completed. (2) clear_dbm: Clear DBM bit for last > level PTE of a specified range. TLBI is not acted. (3) sync_dirty: > Scan last level PTE of a specific range. Log dirty if PTE is writeable. > > Besides, save the dirty state of PTE if it's invalided by map or > unmap. > > Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> > --- > arch/arm64/include/asm/kvm_pgtable.h | 45 +++++++++++++ > arch/arm64/kernel/image-vars.h | 2 + > arch/arm64/kvm/hyp/pgtable.c | 98 ++++++++++++++++++++++++++++ > 3 files changed, 145 insertions(+) > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h > index 3f96bdd2086f..a12add002b89 100644 > --- a/arch/arm64/include/asm/kvm_pgtable.h > +++ b/arch/arm64/include/asm/kvm_pgtable.h > @@ -578,6 +578,51 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, > */ > int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); > > +/** > + * kvm_pgtable_stage2_clear_dbm() - Clear DBM of guest stage-2 address range > + * without TLB invalidation (only last level). > + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). > + * @addr: Intermediate physical address from which to clear DBM, > + * @size: Size of the range. > + * > + * The offset of @addr within a page is ignored and @size is rounded-up to > + * the next page boundary. > + * > + * Note that it is the caller's responsibility to invalidate the TLB after > + * calling this function to ensure that the disabled HW dirty are visible > + * to the CPUs. > + * > + * Return: 0 on success, negative error code on failure. > + */ > +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size); > + > +/** > + * kvm_pgtable_stage2_set_dbm() - Set DBM of guest stage-2 address range to > + * enable HW dirty (only last level). > + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). > + * @addr: Intermediate physical address from which to set DBM. > + * @size: Size of the range. > + * > + * The offset of @addr within a page is ignored and @size is rounded-up to > + * the next page boundary. > + * > + * Return: 0 on success, negative error code on failure. > + */ > +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size); > + > +/** > + * kvm_pgtable_stage2_sync_dirty() - Sync HW dirty state into memslot. > + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). > + * @addr: Intermediate physical address from which to sync. > + * @size: Size of the range. > + * > + * The offset of @addr within a page is ignored and @size is rounded-up to > + * the next page boundary. > + * > + * Return: 0 on success, negative error code on failure. > + */ > +int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr, u64 size); > + > /** > * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range > * without TLB invalidation. 
> diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h > index 35f3c7959513..2ca600e3d637 100644 > --- a/arch/arm64/kernel/image-vars.h > +++ b/arch/arm64/kernel/image-vars.h > @@ -68,6 +68,8 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); > KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); > KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); > > +KVM_NVHE_ALIAS(mark_page_dirty); > + This doesn't make any sense besides satisfying the linker. The hyp page table walkers will *never* need to mark a page as dirty. Consider adding a new function pointer to kvm_pgtable_mm_ops and only set it for the 'normal' stage-2 mm ops in mmu.c. > +static bool stage2_pte_writeable(kvm_pte_t pte) > +{ > + return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; > +} > + > +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx, > + kvm_pte_t new) > +{ > + kvm_pte_t old_pte, pte = ctx->old; > + > + /* Only set DBM if page is writeable */ > + if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte)) > + return; > + > + /* Clear DBM walk is not shared, update */ > + if (!kvm_pgtable_walk_shared(ctx)) { > + WRITE_ONCE(*ctx->ptep, new); > + return; > + } > + > + do { > + old_pte = pte; > + pte = new; > + > + if (old_pte == pte) > + break; > + > + pte = cmpxchg_relaxed(ctx->ptep, old_pte, pte); > + } while (pte != old_pte); > +} > + We don't need a separate walker for updating the DBM state in the page tables. Can't we treat it like any other permission bit and use something like kvm_pgtable_stage2_relax_perms()? Also, what is the purpose of retrying the update on a descriptor if the cmpxchg() fails? Everywhere else in the page table walker code we bail and retry execution since that is a clear indication of a race. In all likelihood the other vCPU thread was either trying to update DBM or service a permission fault. > static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) > { > + if (kvm_pgtable_walk_hw_dbm(ctx)) { > + kvm_update_hw_dbm(ctx, new); > + return true; > + } > + Why are you trying to circumvent the primitive for setting stage-2 PTEs? 
> if (!kvm_pgtable_walk_shared(ctx)) { > WRITE_ONCE(*ctx->ptep, new); > return true; > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, > stage2_pte_executable(new)) > mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); > > + /* Save the possible hardware dirty info */ > + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && > + stage2_pte_writeable(ctx->old)) > + mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT); > + > stage2_make_pte(ctx, new); > > return 0; > @@ -1125,6 +1168,11 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, > */ > stage2_unmap_put_pte(ctx, mmu, mm_ops); > > + /* Save the possible hardware dirty info */ > + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && > + stage2_pte_writeable(ctx->old)) > + mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ctx->addr >> PAGE_SHIFT); > + > if (need_flush && mm_ops->dcache_clean_inval_poc) > mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), > kvm_granule_size(ctx->level)); > @@ -1230,6 +1278,30 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, > return 0; > } > > +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size) > +{ > + int ret; > + u64 offset; > + > + ret = stage2_update_leaf_attrs(pgt, addr, size, KVM_PTE_LEAF_ATTR_HI_S2_DBM, 0, > + NULL, NULL, KVM_PGTABLE_WALK_HW_DBM | > + KVM_PGTABLE_WALK_SHARED); > + if (!ret) > + return ret; > + > + for (offset = 0; offset < size; offset += PAGE_SIZE) > + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr + offset, 3); > + > + return 0; > +} > + > +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size) > +{ > + return stage2_update_leaf_attrs(pgt, addr, size, > + 0, KVM_PTE_LEAF_ATTR_HI_S2_DBM, > + NULL, NULL, KVM_PGTABLE_WALK_HW_DBM); > +} > + > int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) > { > return stage2_update_leaf_attrs(pgt, addr, size, 0, > @@ -1329,6 +1401,32 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, > return ret; > } > > +static int stage2_sync_dirty_walker(const struct kvm_pgtable_visit_ctx *ctx, > + enum kvm_pgtable_walk_flags visit) > +{ > + kvm_pte_t pte = READ_ONCE(*ctx->ptep); ctx->old is fetched in __kvm_pgtable_visit(), and the whole parallelization scheme for page table walkers depends on us being careful to read the PTE once per level. Do you need to reread it here?
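For reference, the mm_ops-based alternative suggested above could look roughly like the sketch below. The `->mark_dirty` member and the helper names are hypothetical (they are not in this series or in mainline); the point is that only the regular stage-2 mm_ops set up in mmu.c would install the callback, so the nVHE hyp object never references mark_page_dirty() and the KVM_NVHE_ALIAS() entry becomes unnecessary.

```c
/*
 * Sketch only: the ->mark_dirty member and helpers below are assumed
 * names, not part of the series.
 *
 * kvm_pgtable_mm_ops would grow:
 *	void (*mark_dirty)(struct kvm_s2_mmu *mmu, u64 ipa);
 */

/* mmu.c: installed only in the 'normal' stage-2 mm_ops, never for hyp. */
static void stage2_mark_dirty(struct kvm_s2_mmu *mmu, u64 ipa)
{
	mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ipa >> PAGE_SHIFT);
}

/* pgtable.c: walkers call the hook instead of mark_page_dirty() directly. */
static void stage2_pte_record_dirty(const struct kvm_pgtable_visit_ctx *ctx,
				    struct kvm_s2_mmu *mmu)
{
	if (ctx->mm_ops->mark_dirty && stage2_pte_writeable(ctx->old))
		ctx->mm_ops->mark_dirty(mmu, ctx->addr);
}
```

With something along these lines the hyp walkers stay oblivious to dirty logging, which is the property the review comment is after.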
> -----Original Message----- > From: Oliver Upton [mailto:oliver.upton@linux.dev] > Sent: 15 September 2023 23:23 > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> > Cc: kvmarm@lists.linux.dev; kvm@vger.kernel.org; > linux-arm-kernel@lists.infradead.org; maz@kernel.org; will@kernel.org; > catalin.marinas@arm.com; james.morse@arm.com; > suzuki.poulose@arm.com; yuzenghui <yuzenghui@huawei.com>; zhukeqian > <zhukeqian1@huawei.com>; Jonathan Cameron > <jonathan.cameron@huawei.com>; Linuxarm <linuxarm@huawei.com> > Subject: Re: [RFC PATCH v2 3/8] KVM: arm64: Add some HW_DBM related > pgtable interfaces > > Hi Shameer, > > On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote: > > From: Keqian Zhu <zhukeqian1@huawei.com> > > > > This adds set_dbm, clear_dbm and sync_dirty interfaces in pgtable > > layer. (1) set_dbm: Set DBM bit for last level PTE of a specified > > range. TLBI is completed. (2) clear_dbm: Clear DBM bit for last > > level PTE of a specified range. TLBI is not acted. (3) sync_dirty: > > Scan last level PTE of a specific range. Log dirty if PTE is writeable. > > > > Besides, save the dirty state of PTE if it's invalided by map or > > unmap. > > > > Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com> > > Signed-off-by: Shameer Kolothum > <shameerali.kolothum.thodi@huawei.com> > > --- > > arch/arm64/include/asm/kvm_pgtable.h | 45 +++++++++++++ > > arch/arm64/kernel/image-vars.h | 2 + > > arch/arm64/kvm/hyp/pgtable.c | 98 > ++++++++++++++++++++++++++++ > > 3 files changed, 145 insertions(+) > > > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h > b/arch/arm64/include/asm/kvm_pgtable.h > > index 3f96bdd2086f..a12add002b89 100644 > > --- a/arch/arm64/include/asm/kvm_pgtable.h > > +++ b/arch/arm64/include/asm/kvm_pgtable.h > > @@ -578,6 +578,51 @@ int kvm_pgtable_stage2_set_owner(struct > kvm_pgtable *pgt, u64 addr, u64 size, > > */ > > int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 > size); > > > > +/** > > + * kvm_pgtable_stage2_clear_dbm() - Clear DBM of guest stage-2 address > range > > + * without TLB invalidation (only > last level). > > + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). > > + * @addr: Intermediate physical address from which to clear DBM, > > + * @size: Size of the range. > > + * > > + * The offset of @addr within a page is ignored and @size is rounded-up > to > > + * the next page boundary. > > + * > > + * Note that it is the caller's responsibility to invalidate the TLB after > > + * calling this function to ensure that the disabled HW dirty are visible > > + * to the CPUs. > > + * > > + * Return: 0 on success, negative error code on failure. > > + */ > > +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, > u64 size); > > + > > +/** > > + * kvm_pgtable_stage2_set_dbm() - Set DBM of guest stage-2 address > range to > > + * enable HW dirty (only last level). > > + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). > > + * @addr: Intermediate physical address from which to set DBM. > > + * @size: Size of the range. > > + * > > + * The offset of @addr within a page is ignored and @size is rounded-up > to > > + * the next page boundary. > > + * > > + * Return: 0 on success, negative error code on failure. > > + */ > > +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 > size); > > + > > +/** > > + * kvm_pgtable_stage2_sync_dirty() - Sync HW dirty state into memslot. 
> > + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). > > + * @addr: Intermediate physical address from which to sync. > > + * @size: Size of the range. > > + * > > + * The offset of @addr within a page is ignored and @size is rounded-up > to > > + * the next page boundary. > > + * > > + * Return: 0 on success, negative error code on failure. > > + */ > > +int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr, > u64 size); > > + > > /** > > * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 > address range > > * without TLB invalidation. > > diff --git a/arch/arm64/kernel/image-vars.h > b/arch/arm64/kernel/image-vars.h > > index 35f3c7959513..2ca600e3d637 100644 > > --- a/arch/arm64/kernel/image-vars.h > > +++ b/arch/arm64/kernel/image-vars.h > > @@ -68,6 +68,8 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); > > KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); > > KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); > > > > +KVM_NVHE_ALIAS(mark_page_dirty); > > + > > This doesn't make any sense besides satisfying the linker. The hyp page > table walkers will *never* need to mark a page as dirty. > > Consider adding a new function pointer to kvm_pgtable_mm_ops and only > set it for the 'normal' stage-2 mm ops in mmu.c. Yes, this is only to satisfy the linker for NVHE. I will add the function pointer to handle this. > > > +static bool stage2_pte_writeable(kvm_pte_t pte) > > +{ > > + return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; > > +} > > + > > +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx, > > + kvm_pte_t new) > > +{ > > + kvm_pte_t old_pte, pte = ctx->old; > > + > > + /* Only set DBM if page is writeable */ > > + if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) > && !stage2_pte_writeable(pte)) > > + return; > > + > > + /* Clear DBM walk is not shared, update */ > > + if (!kvm_pgtable_walk_shared(ctx)) { > > + WRITE_ONCE(*ctx->ptep, new); > > + return; > > + } > > + > > + do { > > + old_pte = pte; > > + pte = new; > > + > > + if (old_pte == pte) > > + break; > > + > > + pte = cmpxchg_relaxed(ctx->ptep, old_pte, pte); > > + } while (pte != old_pte); > > +} > > + > > We don't need a separate walker for updating the DBM state in the > page tables. Can't we treat it like any other permission bit and use > something like kvm_pgtable_stage2_relax_perms()? I treated it separately to isolate it from rest of PTE updates for ease of debug/test initially. But yes, looks like we can treat it generally. > Also, what is the purpose of retrying the update on a descriptor if the > cmpxchg() fails? Everywhere else in the page table walker code we bail > and retry execution since that is a clear indication of a race. In all > likelihood the other vCPU thread was either trying to update DBM or > service a permission fault. Retry was added to avoid any situation where it will return without setting the DBM and we end up scanning for dirty pages irrespectively. I will add some debug info to see the number of races we are getting and its impact on performance. > > > static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, > kvm_pte_t new) > > { > > + if (kvm_pgtable_walk_hw_dbm(ctx)) { > > + kvm_update_hw_dbm(ctx, new); > > + return true; > > + } > > + > > Why are you trying to circumvent the primitive for setting stage-2 PTEs? Not sure it actually does that from a functionality point of view, as the flag _WALK_HW_DBM is only set from set_dbm/clear_dbm paths. But yes we can do better if we get rid of the above retry logic, I guess. 
> > > if (!kvm_pgtable_walk_shared(ctx)) { > > WRITE_ONCE(*ctx->ptep, new); > > return true; > > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const > struct kvm_pgtable_visit_ctx *ctx, > > stage2_pte_executable(new)) > > mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), > granule); > > > > + /* Save the possible hardware dirty info */ > > + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && > > + stage2_pte_writeable(ctx->old)) > > + mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> > PAGE_SHIFT); > > + > > stage2_make_pte(ctx, new); > > > > return 0; > > @@ -1125,6 +1168,11 @@ static int stage2_unmap_walker(const struct > kvm_pgtable_visit_ctx *ctx, > > */ > > stage2_unmap_put_pte(ctx, mmu, mm_ops); > > > > + /* Save the possible hardware dirty info */ > > + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && > > + stage2_pte_writeable(ctx->old)) > > + mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ctx->addr >> > PAGE_SHIFT); > > + > > if (need_flush && mm_ops->dcache_clean_inval_poc) > > mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, > mm_ops), > > kvm_granule_size(ctx->level)); > > @@ -1230,6 +1278,30 @@ static int stage2_update_leaf_attrs(struct > kvm_pgtable *pgt, u64 addr, > > return 0; > > } > > > > +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 > size) > > +{ > > + int ret; > > + u64 offset; > > + > > + ret = stage2_update_leaf_attrs(pgt, addr, size, > KVM_PTE_LEAF_ATTR_HI_S2_DBM, 0, > > + NULL, NULL, KVM_PGTABLE_WALK_HW_DBM | > > + KVM_PGTABLE_WALK_SHARED); > > + if (!ret) > > + return ret; > > + > > + for (offset = 0; offset < size; offset += PAGE_SIZE) > > + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr + > offset, 3); > > + > > + return 0; > > +} > > + > > +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, > u64 size) > > +{ > > + return stage2_update_leaf_attrs(pgt, addr, size, > > + 0, KVM_PTE_LEAF_ATTR_HI_S2_DBM, > > + NULL, NULL, KVM_PGTABLE_WALK_HW_DBM); > > +} > > + > > int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, > u64 size) > > { > > return stage2_update_leaf_attrs(pgt, addr, size, 0, > > @@ -1329,6 +1401,32 @@ int kvm_pgtable_stage2_relax_perms(struct > kvm_pgtable *pgt, u64 addr, > > return ret; > > } > > > > +static int stage2_sync_dirty_walker(const struct kvm_pgtable_visit_ctx > *ctx, > > + enum kvm_pgtable_walk_flags visit) > > +{ > > + kvm_pte_t pte = READ_ONCE(*ctx->ptep); > > ctx->old is fetched in __kvm_pgtable_visit(), and the whole > parallelization scheme for page table walkers depends on us being > careful to read the PTE once per level. Do you need to reread it here? Nope. That's a mistake. Will change that. Thanks, Shameer
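The "bail and retry" convention discussed above is what the stock stage2_try_set_pte() already provides: on a shared walk it does a single cmpxchg() against ctx->old and reports failure, and the walker propagates -EAGAIN so the caller retries. A DBM update following that convention might look like the sketch below; the callback name and how it would be wired into stage2_update_leaf_attrs() are assumptions, not code from this series.

```c
/*
 * Illustrative only: a set-DBM leaf callback that reuses the stock
 * stage2_try_set_pte() primitive and bails on a race instead of looping.
 */
static int stage2_set_dbm_walker(const struct kvm_pgtable_visit_ctx *ctx,
				 enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t new = ctx->old | KVM_PTE_LEAF_ATTR_HI_S2_DBM;

	/* Only valid, last-level, already-writeable PTEs are DBM candidates */
	if (ctx->level != KVM_PGTABLE_MAX_LEVELS - 1 ||
	    !kvm_pte_valid(ctx->old) || !stage2_pte_writeable(ctx->old))
		return 0;

	if (new == ctx->old)
		return 0;

	/* A failed cmpxchg() means a racing walker: bail and let the caller retry */
	if (!stage2_try_set_pte(ctx, new))
		return -EAGAIN;

	return 0;
}
```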
On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote:
> +static bool stage2_pte_writeable(kvm_pte_t pte)
> +{
> +	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
> +}
> +
> +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
> +			      kvm_pte_t new)
> +{
> +	kvm_pte_t old_pte, pte = ctx->old;
> +
> +	/* Only set DBM if page is writeable */
> +	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte))
> +		return;
> +
> +	/* Clear DBM walk is not shared, update */
> +	if (!kvm_pgtable_walk_shared(ctx)) {
> +		WRITE_ONCE(*ctx->ptep, new);
> +		return;
> +	}

I was wondering if this interferes with the OS dirty tracking (not the
KVM one) but I think that's ok, at least at this point, since the PTE is
already writeable and a fault would have marked the underlying page as
dirty (user_mem_abort() -> kvm_set_pfn_dirty()).

I'm not particularly fond of relying on this but I need to see how it
fits with the rest of the series. IIRC KVM doesn't go around and make
Stage 2 PTEs read-only but rather unmaps them when it changes the
permission of the corresponding Stage 1 VMM mapping.

My personal preference would be to track dirty/clean properly as we do
for stage 1 (e.g. DBM means writeable PTE) but it has some downsides
like the try_to_unmap() code having to retrieve the dirty state via
notifiers.

Anyway, assuming this works correctly, it means that live migration via
DBM is only tracked for PTEs already made dirty/writeable by some guest
write.

> @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
> 		    stage2_pte_executable(new))
> 			mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
> 
> +	/* Save the possible hardware dirty info */
> +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> +	    stage2_pte_writeable(ctx->old))
> +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT);
> +
> 	stage2_make_pte(ctx, new);

Isn't this racy and potentially losing the dirty state? Or is the 'new'
value guaranteed to have the S2AP[1] bit? For stage 1 we normally make
the page genuinely read-only (clearing DBM) in a cmpxchg loop to
preserve the dirty state (see ptep_set_wrprotect()).
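For context, the stage-1 pattern referenced above (ptep_set_wrprotect() on arm64) folds the hardware dirty state into the software dirty bit before the PTE is made genuinely read-only, inside a cmpxchg() loop so that a racing hardware DBM update cannot be lost. A simplified sketch, close to but not a verbatim copy of the arch code:

```c
/*
 * Simplified sketch of arm64's ptep_set_wrprotect(): pte_wrprotect()
 * transfers a hardware-dirty encoding (writeable + !PTE_RDONLY) into the
 * software PTE_DIRTY bit before clearing write permission, and the
 * cmpxchg() loop guarantees that dirty state set by a racing hardware
 * walker is not thrown away.
 */
static inline void stage1_wrprotect_sketch(pte_t *ptep)
{
	pte_t old_pte, pte;

	pte = READ_ONCE(*ptep);
	do {
		old_pte = pte;
		pte = pte_wrprotect(pte);	/* preserves dirty as PTE_DIRTY */
		pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
					       pte_val(old_pte), pte_val(pte));
	} while (pte_val(pte) != pte_val(old_pte));
}
```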
On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote:
> On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote:
> > +static bool stage2_pte_writeable(kvm_pte_t pte)
> > +{
> > +	return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
> > +}
> > +
> > +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx,
> > +			      kvm_pte_t new)
> > +{
> > +	kvm_pte_t old_pte, pte = ctx->old;
> > +
> > +	/* Only set DBM if page is writeable */
> > +	if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte))
> > +		return;
> > +
> > +	/* Clear DBM walk is not shared, update */
> > +	if (!kvm_pgtable_walk_shared(ctx)) {
> > +		WRITE_ONCE(*ctx->ptep, new);
> > +		return;
> > +	}
> 
> I was wondering if this interferes with the OS dirty tracking (not the
> KVM one) but I think that's ok, at least at this point, since the PTE is
> already writeable and a fault would have marked the underlying page as
> dirty (user_mem_abort() -> kvm_set_pfn_dirty()).
> 
> I'm not particularly fond of relying on this but I need to see how it
> fits with the rest of the series. IIRC KVM doesn't go around and make
> Stage 2 PTEs read-only but rather unmaps them when it changes the
> permission of the corresponding Stage 1 VMM mapping.
> 
> My personal preference would be to track dirty/clean properly as we do
> for stage 1 (e.g. DBM means writeable PTE) but it has some downsides
> like the try_to_unmap() code having to retrieve the dirty state via
> notifiers.

KVM's usage of DBM is complicated by the fact that the dirty log
interface w/ userspace is at PTE granularity. We only want the page
table walker to relax PTEs, but take faults on hugepages so we can do
page splitting.

> Anyway, assuming this works correctly, it means that live migration via
> DBM is only tracked for PTEs already made dirty/writeable by some guest
> write.

I'm hoping that we move away from this combined write-protection and
DBM scheme and only use a single dirty tracking strategy at a time.

> > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
> > 		    stage2_pte_executable(new))
> > 			mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
> > 
> > +	/* Save the possible hardware dirty info */
> > +	if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) &&
> > +	    stage2_pte_writeable(ctx->old))
> > +		mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT);
> > +
> > 	stage2_make_pte(ctx, new);
> 
> Isn't this racy and potentially losing the dirty state? Or is the 'new'
> value guaranteed to have the S2AP[1] bit? For stage 1 we normally make
> the page genuinely read-only (clearing DBM) in a cmpxchg loop to
> preserve the dirty state (see ptep_set_wrprotect()).

stage2_try_break_pte() a few lines up does a cmpxchg() and full
break-before-make, so at this point there shouldn't be a race with
either software or hardware table walkers.

But I'm still confused by this one. KVM only goes down the map
walker path (in the context of dirty tracking) if:

 - We took a translation fault

 - We took a write permission fault on a hugepage and need to split

In both cases the 'old' translation should have DBM cleared. Even if the
PTE were dirty, this is wasted work since we need to do a final scan of
the stage-2 when userspace collects the dirty log.

Am I missing something?
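The ordering being described is roughly the sketch below: by the time the dirty-save hunk quoted above runs, stage2_try_break_pte() has already invalidated the old PTE and its TLB entries, so neither the hardware walker nor another software walker can still update it. The function name here is made up; it only condenses the flow of stage2_map_walker_try_leaf().

```c
/*
 * Condensed sketch of the break-before-make flow; 'new' is whatever leaf
 * PTE the map path computed. Not the function from the series.
 */
static int stage2_install_leaf_sketch(const struct kvm_pgtable_visit_ctx *ctx,
				      struct kvm_s2_mmu *mmu, kvm_pte_t new)
{
	/*
	 * Full break-before-make: cmpxchg() the old PTE to an invalid value
	 * and invalidate the TLB. A failed cmpxchg() means a racing walker,
	 * so the fault path retries.
	 */
	if (!stage2_try_break_pte(ctx, mmu))
		return -EAGAIN;

	/*
	 * The old PTE can no longer be updated by hardware or by other
	 * walkers, so reading its S2AP[1]/dirty state here is race-free.
	 */
	if (ctx->level == KVM_PGTABLE_MAX_LEVELS - 1 &&
	    stage2_pte_writeable(ctx->old))
		mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ctx->addr >> PAGE_SHIFT);

	stage2_make_pte(ctx, new);
	return 0;
}
```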
> -----Original Message----- > From: Oliver Upton [mailto:oliver.upton@linux.dev] > Sent: 22 September 2023 18:49 > To: Catalin Marinas <catalin.marinas@arm.com> > Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>; > kvmarm@lists.linux.dev; kvm@vger.kernel.org; > linux-arm-kernel@lists.infradead.org; maz@kernel.org; will@kernel.org; > james.morse@arm.com; suzuki.poulose@arm.com; yuzenghui > <yuzenghui@huawei.com>; zhukeqian <zhukeqian1@huawei.com>; Jonathan > Cameron <jonathan.cameron@huawei.com>; Linuxarm > <linuxarm@huawei.com> > Subject: Re: [RFC PATCH v2 3/8] KVM: arm64: Add some HW_DBM related > pgtable interfaces > > On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote: > > On Fri, Aug 25, 2023 at 10:35:23AM +0100, Shameer Kolothum wrote: > > > +static bool stage2_pte_writeable(kvm_pte_t pte) > > > +{ > > > + return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; > > > +} > > > + > > > +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx > *ctx, > > > + kvm_pte_t new) > > > +{ > > > + kvm_pte_t old_pte, pte = ctx->old; > > > + > > > + /* Only set DBM if page is writeable */ > > > + if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) > && !stage2_pte_writeable(pte)) > > > + return; > > > + > > > + /* Clear DBM walk is not shared, update */ > > > + if (!kvm_pgtable_walk_shared(ctx)) { > > > + WRITE_ONCE(*ctx->ptep, new); > > > + return; > > > + } > > > > I was wondering if this interferes with the OS dirty tracking (not the > > KVM one) but I think that's ok, at least at this point, since the PTE is > > already writeable and a fault would have marked the underlying page as > > dirty (user_mem_abort() -> kvm_set_pfn_dirty()). > > > > I'm not particularly fond of relying on this but I need to see how it > > fits with the rest of the series. IIRC KVM doesn't go around and make > > Stage 2 PTEs read-only but rather unmaps them when it changes the > > permission of the corresponding Stage 1 VMM mapping. > > > > My personal preference would be to track dirty/clean properly as we do > > for stage 1 (e.g. DBM means writeable PTE) but it has some downsides > > like the try_to_unmap() code having to retrieve the dirty state via > > notifiers. > > KVM's usage of DBM is complicated by the fact that the dirty log > interface w/ userspace is at PTE granularity. We only want the page > table walker to relax PTEs, but take faults on hugepages so we can do > page splitting. > > > Anyway, assuming this works correctly, it means that live migration via > > DBM is only tracked for PTEs already made dirty/writeable by some guest > > write. > > I'm hoping that we move away from this combined write-protection and > DBM > scheme and only use a single dirty tracking strategy at a time. Yes. As mentioned in the cover letter this is a combined approach where we only set DBM for near-by pages(64) on page fault when migration is started. > > > > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const > struct kvm_pgtable_visit_ctx *ctx, > > > stage2_pte_executable(new)) > > > mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), > granule); > > > > > > + /* Save the possible hardware dirty info */ > > > + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && > > > + stage2_pte_writeable(ctx->old)) > > > + mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), > ctx->addr >> PAGE_SHIFT); > > > + > > > stage2_make_pte(ctx, new); > > > > Isn't this racy and potentially losing the dirty state? Or is the 'new' > > value guaranteed to have the S2AP[1] bit? 
> > For stage 1 we normally make
> > the page genuinely read-only (clearing DBM) in a cmpxchg loop to
> > preserve the dirty state (see ptep_set_wrprotect()).
> 
> stage2_try_break_pte() a few lines up does a cmpxchg() and full
> break-before-make, so at this point there shouldn't be a race with
> either software or hardware table walkers.
> 
> But I'm still confused by this one. KVM only goes down the map
> walker path (in the context of dirty tracking) if:
> 
>  - We took a translation fault
> 
>  - We took a write permission fault on a hugepage and need to split

Agree.

> In both cases the 'old' translation should have DBM cleared. Even if the
> PTE were dirty, this is wasted work since we need to do a final scan of
> the stage-2 when userspace collects the dirty log.
> 
> Am I missing something?

I think we can get rid of the above mark_page_dirty(). I will test it to
confirm we are not missing anything here.

Thanks,
Shameer
On Mon, Sep 25, 2023 at 08:04:39AM +0000, Shameerali Kolothum Thodi wrote: > From: Oliver Upton [mailto:oliver.upton@linux.dev] > > On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote: > > > I was wondering if this interferes with the OS dirty tracking (not the > > > KVM one) but I think that's ok, at least at this point, since the PTE is > > > already writeable and a fault would have marked the underlying page as > > > dirty (user_mem_abort() -> kvm_set_pfn_dirty()). > > > > > > I'm not particularly fond of relying on this but I need to see how it > > > fits with the rest of the series. IIRC KVM doesn't go around and make > > > Stage 2 PTEs read-only but rather unmaps them when it changes the > > > permission of the corresponding Stage 1 VMM mapping. > > > > > > My personal preference would be to track dirty/clean properly as we do > > > for stage 1 (e.g. DBM means writeable PTE) but it has some downsides > > > like the try_to_unmap() code having to retrieve the dirty state via > > > notifiers. > > > > KVM's usage of DBM is complicated by the fact that the dirty log > > interface w/ userspace is at PTE granularity. We only want the page > > table walker to relax PTEs, but take faults on hugepages so we can do > > page splitting. Thanks for the clarification. > > > > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, > > > > stage2_pte_executable(new)) > > > > mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); > > > > > > > > + /* Save the possible hardware dirty info */ > > > > + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && > > > > + stage2_pte_writeable(ctx->old)) > > > > + mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT); > > > > + > > > > stage2_make_pte(ctx, new); > > > > > > Isn't this racy and potentially losing the dirty state? Or is the 'new' > > > value guaranteed to have the S2AP[1] bit? For stage 1 we normally make > > > the page genuinely read-only (clearing DBM) in a cmpxchg loop to > > > preserve the dirty state (see ptep_set_wrprotect()). > > > > stage2_try_break_pte() a few lines up does a cmpxchg() and full > > break-before-make, so at this point there shouldn't be a race with > > either software or hardware table walkers. Ah, I missed this. Also it was unrelated to this patch (or rather not introduced by this patch). > > In both cases the 'old' translation should have DBM cleared. Even if the > > PTE were dirty, this is wasted work since we need to do a final scan of > > the stage-2 when userspace collects the dirty log. > > > > Am I missing something? > > I think we can get rid of the above mark_page_dirty(). I will test it to confirm > we are not missing anything here. Is this the case for the other places of mark_page_dirty() in your patches? If stage2_pte_writeable() is true, it must have been made writeable earlier by a fault and the underlying page marked as dirty.
> -----Original Message----- > From: Catalin Marinas [mailto:catalin.marinas@arm.com] > Sent: 26 September 2023 16:20 > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> > Cc: Oliver Upton <oliver.upton@linux.dev>; kvmarm@lists.linux.dev; > kvm@vger.kernel.org; linux-arm-kernel@lists.infradead.org; maz@kernel.org; > will@kernel.org; james.morse@arm.com; suzuki.poulose@arm.com; > yuzenghui <yuzenghui@huawei.com>; zhukeqian > <zhukeqian1@huawei.com>; Jonathan Cameron > <jonathan.cameron@huawei.com>; Linuxarm <linuxarm@huawei.com> > Subject: Re: [RFC PATCH v2 3/8] KVM: arm64: Add some HW_DBM related > pgtable interfaces > > On Mon, Sep 25, 2023 at 08:04:39AM +0000, Shameerali Kolothum Thodi > wrote: > > From: Oliver Upton [mailto:oliver.upton@linux.dev] > > > On Fri, Sep 22, 2023 at 04:24:11PM +0100, Catalin Marinas wrote: > > > > I was wondering if this interferes with the OS dirty tracking (not the > > > > KVM one) but I think that's ok, at least at this point, since the PTE is > > > > already writeable and a fault would have marked the underlying page > as > > > > dirty (user_mem_abort() -> kvm_set_pfn_dirty()). > > > > > > > > I'm not particularly fond of relying on this but I need to see how it > > > > fits with the rest of the series. IIRC KVM doesn't go around and make > > > > Stage 2 PTEs read-only but rather unmaps them when it changes the > > > > permission of the corresponding Stage 1 VMM mapping. > > > > > > > > My personal preference would be to track dirty/clean properly as we do > > > > for stage 1 (e.g. DBM means writeable PTE) but it has some downsides > > > > like the try_to_unmap() code having to retrieve the dirty state via > > > > notifiers. > > > > > > KVM's usage of DBM is complicated by the fact that the dirty log > > > interface w/ userspace is at PTE granularity. We only want the page > > > table walker to relax PTEs, but take faults on hugepages so we can do > > > page splitting. > > Thanks for the clarification. > > > > > > @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const > struct kvm_pgtable_visit_ctx *ctx, > > > > > stage2_pte_executable(new)) > > > > > mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), > granule); > > > > > > > > > > + /* Save the possible hardware dirty info */ > > > > > + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && > > > > > + stage2_pte_writeable(ctx->old)) > > > > > + mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), > ctx->addr >> PAGE_SHIFT); > > > > > + > > > > > stage2_make_pte(ctx, new); > > > > > > > > Isn't this racy and potentially losing the dirty state? Or is the 'new' > > > > value guaranteed to have the S2AP[1] bit? For stage 1 we normally > make > > > > the page genuinely read-only (clearing DBM) in a cmpxchg loop to > > > > preserve the dirty state (see ptep_set_wrprotect()). > > > > > > stage2_try_break_pte() a few lines up does a cmpxchg() and full > > > break-before-make, so at this point there shouldn't be a race with > > > either software or hardware table walkers. > > Ah, I missed this. Also it was unrelated to this patch (or rather not > introduced by this patch). > > > > In both cases the 'old' translation should have DBM cleared. Even if the > > > PTE were dirty, this is wasted work since we need to do a final scan of > > > the stage-2 when userspace collects the dirty log. > > > > > > Am I missing something? > > > > I think we can get rid of the above mark_page_dirty(). I will test it to > confirm > > we are not missing anything here. 
> 
> Is this the case for the other places of mark_page_dirty() in your
> patches? If stage2_pte_writeable() is true, it must have been made
> writeable earlier by a fault and the underlying page marked as dirty.
> 

One of the other places where we have mark_page_dirty() is in
stage2_unmap_walker(). During testing of this series I tried to remove
it and found that doing so actually causes memory corruption during VM
migration.

From my old debug logs:

[ 399.288076] stage2_unmap_walker+0x270/0x284
[ 399.288078] __kvm_pgtable_walk+0x1ec/0x2cc
[ 399.288081] __kvm_pgtable_walk+0xec/0x2cc
[ 399.288084] __kvm_pgtable_walk+0xec/0x2cc
[ 399.288086] kvm_pgtable_walk+0xcc/0x160
[ 399.288088] kvm_pgtable_stage2_unmap+0x4c/0xbc
[ 399.288091] stage2_apply_range+0xd0/0xec
[ 399.288094] __unmap_stage2_range+0x2c/0x60
[ 399.288096] kvm_unmap_gfn_range+0x30/0x48
[ 399.288099] kvm_mmu_notifier_invalidate_range_start+0xe0/0x264
[ 399.288103] __mmu_notifier_invalidate_range_start+0xa4/0x23c
[ 399.288106] change_protection+0x638/0x900
[ 399.288109] change_prot_numa+0x64/0xfc
[ 399.288113] task_numa_work+0x2ac/0x450
[ 399.288117] task_work_run+0x78/0xd0

It looks like the unmap path gets triggered from the NUMA page migration
code path, so we may need it there.

Thanks,
Shameer
On Tue, Sep 26, 2023 at 03:52:19PM +0000, Shameerali Kolothum Thodi wrote:
> From: Catalin Marinas [mailto:catalin.marinas@arm.com]
> > On Mon, Sep 25, 2023 at 08:04:39AM +0000, Shameerali Kolothum Thodi wrote:
> > > From: Oliver Upton [mailto:oliver.upton@linux.dev]
> > > > In both cases the 'old' translation should have DBM cleared. Even if the
> > > > PTE were dirty, this is wasted work since we need to do a final scan of
> > > > the stage-2 when userspace collects the dirty log.
> > > >
> > > > Am I missing something?
> >
> > > I think we can get rid of the above mark_page_dirty(). I will test it to confirm
> > > we are not missing anything here.
> >
> > Is this the case for the other places of mark_page_dirty() in your
> > patches? If stage2_pte_writeable() is true, it must have been made
> > writeable earlier by a fault and the underlying page marked as dirty.
>
> One of the other place we have mark_page_dirty() is in the stage2_unmap_walker().
> And during the testing of this series, I have tried to remove that and
> found out that it actually causes memory corruption during VM migration.
>
> From my old debug logs:
>
> [ 399.288076] stage2_unmap_walker+0x270/0x284
> [ 399.288078] __kvm_pgtable_walk+0x1ec/0x2cc
> [ 399.288081] __kvm_pgtable_walk+0xec/0x2cc
> [ 399.288084] __kvm_pgtable_walk+0xec/0x2cc
> [ 399.288086] kvm_pgtable_walk+0xcc/0x160
> [ 399.288088] kvm_pgtable_stage2_unmap+0x4c/0xbc
> [ 399.288091] stage2_apply_range+0xd0/0xec
> [ 399.288094] __unmap_stage2_range+0x2c/0x60
> [ 399.288096] kvm_unmap_gfn_range+0x30/0x48
> [ 399.288099] kvm_mmu_notifier_invalidate_range_start+0xe0/0x264
> [ 399.288103] __mmu_notifier_invalidate_range_start+0xa4/0x23c
> [ 399.288106] change_protection+0x638/0x900
> [ 399.288109] change_prot_numa+0x64/0xfc
> [ 399.288113] task_numa_work+0x2ac/0x450
> [ 399.288117] task_work_run+0x78/0xd0
>
> It looks like that the unmap path gets triggered from Numa page migration code
> path, so we may need it there.

I think I confused things (should have looked at the code).
mark_page_dirty() is actually about marking the bitmap for dirty
tracking rather than the actual struct page. The latter is hidden
somewhere deep down the get_user_pages() code.

So yeah, I think we still need mark_page_dirty() in some places to
avoid losing the dirty information with DBM being turned on.
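To make the distinction concrete: mark_page_dirty() only records the gfn in the memslot's dirty bitmap consumed by the dirty-log ioctl; it never touches the struct page. A simplified sketch of what it does (the real function also handles the dirty ring and delegates the checks to mark_page_dirty_in_slot()):

```c
/*
 * Simplified sketch of mark_page_dirty(); the dirty-ring path is omitted.
 * It only sets a bit in the memslot's dirty bitmap, it does not mark the
 * underlying struct page dirty.
 */
static void mark_page_dirty_sketch(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		set_bit_le(rel_gfn, memslot->dirty_bitmap);
	}
}
```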
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3f96bdd2086f..a12add002b89 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -578,6 +578,51 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, */ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); +/** + * kvm_pgtable_stage2_clear_dbm() - Clear DBM of guest stage-2 address range + * without TLB invalidation (only last level). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to clear DBM, + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Note that it is the caller's responsibility to invalidate the TLB after + * calling this function to ensure that the disabled HW dirty are visible + * to the CPUs. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_set_dbm() - Set DBM of guest stage-2 address range to + * enable HW dirty (only last level). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to set DBM. + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_sync_dirty() - Sync HW dirty state into memslot. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to sync. + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr, u64 size); + /** * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range * without TLB invalidation. diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 35f3c7959513..2ca600e3d637 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -68,6 +68,8 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); +KVM_NVHE_ALIAS(mark_page_dirty); + #ifdef CONFIG_ARM64_PSEUDO_NMI /* Static key checked in GIC_PRIO_IRQOFF. 
*/ KVM_NVHE_ALIAS(gic_nonsecure_priorities); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 1e65b8c97059..d7a46a00a7f6 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -9,6 +9,7 @@ #include <linux/bitfield.h> #include <asm/kvm_pgtable.h> +#include <asm/kvm_mmu.h> #include <asm/stage2_pgtable.h> @@ -42,6 +43,7 @@ #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S2_DBM BIT(51) #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) @@ -764,8 +766,44 @@ static bool stage2_pte_is_locked(kvm_pte_t pte) return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); } +static bool stage2_pte_writeable(kvm_pte_t pte) +{ + return pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; +} + +static void kvm_update_hw_dbm(const struct kvm_pgtable_visit_ctx *ctx, + kvm_pte_t new) +{ + kvm_pte_t old_pte, pte = ctx->old; + + /* Only set DBM if page is writeable */ + if ((new & KVM_PTE_LEAF_ATTR_HI_S2_DBM) && !stage2_pte_writeable(pte)) + return; + + /* Clear DBM walk is not shared, update */ + if (!kvm_pgtable_walk_shared(ctx)) { + WRITE_ONCE(*ctx->ptep, new); + return; + } + + do { + old_pte = pte; + pte = new; + + if (old_pte == pte) + break; + + pte = cmpxchg_relaxed(ctx->ptep, old_pte, pte); + } while (pte != old_pte); +} + static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) { + if (kvm_pgtable_walk_hw_dbm(ctx)) { + kvm_update_hw_dbm(ctx, new); + return true; + } + if (!kvm_pgtable_walk_shared(ctx)) { WRITE_ONCE(*ctx->ptep, new); return true; @@ -952,6 +990,11 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); + /* Save the possible hardware dirty info */ + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && + stage2_pte_writeable(ctx->old)) + mark_page_dirty(kvm_s2_mmu_to_kvm(pgt->mmu), ctx->addr >> PAGE_SHIFT); + stage2_make_pte(ctx, new); return 0; @@ -1125,6 +1168,11 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, */ stage2_unmap_put_pte(ctx, mmu, mm_ops); + /* Save the possible hardware dirty info */ + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && + stage2_pte_writeable(ctx->old)) + mark_page_dirty(kvm_s2_mmu_to_kvm(mmu), ctx->addr >> PAGE_SHIFT); + if (need_flush && mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); @@ -1230,6 +1278,30 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, return 0; } +int kvm_pgtable_stage2_set_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + int ret; + u64 offset; + + ret = stage2_update_leaf_attrs(pgt, addr, size, KVM_PTE_LEAF_ATTR_HI_S2_DBM, 0, + NULL, NULL, KVM_PGTABLE_WALK_HW_DBM | + KVM_PGTABLE_WALK_SHARED); + if (!ret) + return ret; + + for (offset = 0; offset < size; offset += PAGE_SIZE) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr + offset, 3); + + return 0; +} + +int kvm_pgtable_stage2_clear_dbm(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + return stage2_update_leaf_attrs(pgt, addr, size, + 0, KVM_PTE_LEAF_ATTR_HI_S2_DBM, + NULL, NULL, KVM_PGTABLE_WALK_HW_DBM); +} + int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { return stage2_update_leaf_attrs(pgt, addr, size, 0, @@ -1329,6 +1401,32 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, return ret; } +static int stage2_sync_dirty_walker(const struct 
kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + kvm_pte_t pte = READ_ONCE(*ctx->ptep); + struct kvm *kvm = ctx->arg; + + if (!kvm_pte_valid(pte)) + return 0; + + if ((ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) && stage2_pte_writeable(pte)) + mark_page_dirty(kvm, ctx->addr >> PAGE_SHIFT); + + return 0; +} + +int kvm_pgtable_stage2_sync_dirty(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_sync_dirty_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = kvm_s2_mmu_to_kvm(pgt->mmu), + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) {
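For a sense of how the three new interfaces are meant to compose, the sketch below shows one plausible caller in mmu.c, based on the cover-letter description (arm DBM on batches of neighbouring, already-writeable PTEs while dirty logging is active, then harvest and disarm when userspace collects the dirty log). These callers are not part of this patch and the exact sequencing in the series may differ.

```c
/*
 * Hypothetical mmu.c-side users of the new pgtable interfaces; names and
 * ordering are assumptions for illustration only.
 */
static void stage2_arm_hw_dbm(struct kvm *kvm, phys_addr_t ipa, u64 size)
{
	/* Let the MMU set S2AP[1] itself on already-writeable last-level PTEs. */
	kvm_pgtable_stage2_set_dbm(kvm->arch.mmu.pgt, ipa, size);
}

static void stage2_harvest_hw_dirty(struct kvm *kvm, phys_addr_t ipa, u64 size)
{
	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;

	/* Stop further hardware updates; clear_dbm() does not invalidate TLBs... */
	kvm_pgtable_stage2_clear_dbm(pgt, ipa, size);
	/* ...so the caller must, as the new header comment requires. */
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	/* Transfer S2AP[1] (still set on dirtied PTEs) into the dirty bitmap. */
	kvm_pgtable_stage2_sync_dirty(pgt, ipa, size);
}
```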