diff mbox series

[v5,3/9] KVM: arm64: Relax locking for kvm_test_age_gfn and kvm_age_gfn

Message ID 20240611002145.2078921-4-jthoughton@google.com (mailing list archive)
State New
Headers show
Series mm: multi-gen LRU: Walk secondary MMU page tables while aging | expand

Commit Message

James Houghton June 11, 2024, 12:21 a.m. UTC
Replace the MMU write locks (taken in the memslot iteration loop) for
read locks.

Grabbing the read lock instead of the write lock is safe because the
only requirement we have is that the stage-2 page tables do not get
deallocated while we are walking them. The stage2_age_walker() callback
is safe to race with itself; update the comment to reflect the
synchronization change.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 arch/arm64/kvm/Kconfig       |  1 +
 arch/arm64/kvm/hyp/pgtable.c | 15 +++++++++------
 arch/arm64/kvm/mmu.c         | 26 ++++++++++++++++++++------
 3 files changed, 30 insertions(+), 12 deletions(-)

Comments

Oliver Upton June 11, 2024, 5:57 a.m. UTC | #1
On Tue, Jun 11, 2024 at 12:21:39AM +0000, James Houghton wrote:
> Replace the MMU write locks (taken in the memslot iteration loop) for
> read locks.
> 
> Grabbing the read lock instead of the write lock is safe because the
> only requirement we have is that the stage-2 page tables do not get
> deallocated while we are walking them. The stage2_age_walker() callback
> is safe to race with itself; update the comment to reflect the
> synchronization change.
> 
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  arch/arm64/kvm/Kconfig       |  1 +
>  arch/arm64/kvm/hyp/pgtable.c | 15 +++++++++------
>  arch/arm64/kvm/mmu.c         | 26 ++++++++++++++++++++------
>  3 files changed, 30 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
> index 58f09370d17e..7a1af8141c0e 100644
> --- a/arch/arm64/kvm/Kconfig
> +++ b/arch/arm64/kvm/Kconfig
> @@ -22,6 +22,7 @@ menuconfig KVM
>  	select KVM_COMMON
>  	select KVM_GENERIC_HARDWARE_ENABLING
>  	select KVM_GENERIC_MMU_NOTIFIER
> +	select KVM_MMU_NOTIFIER_YOUNG_LOCKLESS
>  	select HAVE_KVM_CPU_RELAX_INTERCEPT
>  	select KVM_MMIO
>  	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 9e2bbee77491..b1b0f7148cff 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -1319,10 +1319,10 @@ static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
>  	data->young = true;
>  
>  	/*
> -	 * stage2_age_walker() is always called while holding the MMU lock for
> -	 * write, so this will always succeed. Nonetheless, this deliberately
> -	 * follows the race detection pattern of the other stage-2 walkers in
> -	 * case the locking mechanics of the MMU notifiers is ever changed.
> +	 * This walk may not be exclusive; the PTE is permitted to change

s/may not/is not/

> +	 * from under us. If there is a race to update this PTE, then the
> +	 * GFN is most likely young, so failing to clear the AF is likely
> +	 * to be inconsequential.
>  	 */
>  	if (data->mkold && !stage2_try_set_pte(ctx, new))
>  		return -EAGAIN;
> @@ -1345,10 +1345,13 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
>  	struct kvm_pgtable_walker walker = {
>  		.cb		= stage2_age_walker,
>  		.arg		= &data,
> -		.flags		= KVM_PGTABLE_WALK_LEAF,
> +		.flags		= KVM_PGTABLE_WALK_LEAF |
> +				  KVM_PGTABLE_WALK_SHARED,
>  	};
> +	int r;
>  
> -	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
> +	r = kvm_pgtable_walk(pgt, addr, size, &walker);
> +	WARN_ON(r && r != -EAGAIN);

I could've been more explicit last time around, could you please tone
this down to WARN_ON_ONCE() as well?

>  	return data.young;
>  }
>  
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 8bcab0cc3fe9..a62c27a347ed 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1773,25 +1773,39 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>  	u64 size = (range->end - range->start) << PAGE_SHIFT;
> +	bool young = false;
> +
> +	read_lock(&kvm->mmu_lock);
>  
>  	if (!kvm->arch.mmu.pgt)
>  		return false;

I'm guessing you meant to have 'goto out' here, since this early return
fails to drop the mmu_lock.
James Houghton June 11, 2024, 4:52 p.m. UTC | #2
On Mon, Jun 10, 2024 at 10:58 PM Oliver Upton <oliver.upton@linux.dev> wrote:
>
> On Tue, Jun 11, 2024 at 12:21:39AM +0000, James Houghton wrote:
> > Replace the MMU write locks (taken in the memslot iteration loop) for
> > read locks.
> >
> > Grabbing the read lock instead of the write lock is safe because the
> > only requirement we have is that the stage-2 page tables do not get
> > deallocated while we are walking them. The stage2_age_walker() callback
> > is safe to race with itself; update the comment to reflect the
> > synchronization change.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > ---
> >  arch/arm64/kvm/Kconfig       |  1 +
> >  arch/arm64/kvm/hyp/pgtable.c | 15 +++++++++------
> >  arch/arm64/kvm/mmu.c         | 26 ++++++++++++++++++++------
> >  3 files changed, 30 insertions(+), 12 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
> > index 58f09370d17e..7a1af8141c0e 100644
> > --- a/arch/arm64/kvm/Kconfig
> > +++ b/arch/arm64/kvm/Kconfig
> > @@ -22,6 +22,7 @@ menuconfig KVM
> >       select KVM_COMMON
> >       select KVM_GENERIC_HARDWARE_ENABLING
> >       select KVM_GENERIC_MMU_NOTIFIER
> > +     select KVM_MMU_NOTIFIER_YOUNG_LOCKLESS
> >       select HAVE_KVM_CPU_RELAX_INTERCEPT
> >       select KVM_MMIO
> >       select KVM_GENERIC_DIRTYLOG_READ_PROTECT
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 9e2bbee77491..b1b0f7148cff 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -1319,10 +1319,10 @@ static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
> >       data->young = true;
> >
> >       /*
> > -      * stage2_age_walker() is always called while holding the MMU lock for
> > -      * write, so this will always succeed. Nonetheless, this deliberately
> > -      * follows the race detection pattern of the other stage-2 walkers in
> > -      * case the locking mechanics of the MMU notifiers is ever changed.
> > +      * This walk may not be exclusive; the PTE is permitted to change
>
> s/may not/is not/

Will fix.

>
> > +      * from under us. If there is a race to update this PTE, then the
> > +      * GFN is most likely young, so failing to clear the AF is likely
> > +      * to be inconsequential.
> >        */
> >       if (data->mkold && !stage2_try_set_pte(ctx, new))
> >               return -EAGAIN;
> > @@ -1345,10 +1345,13 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
> >       struct kvm_pgtable_walker walker = {
> >               .cb             = stage2_age_walker,
> >               .arg            = &data,
> > -             .flags          = KVM_PGTABLE_WALK_LEAF,
> > +             .flags          = KVM_PGTABLE_WALK_LEAF |
> > +                               KVM_PGTABLE_WALK_SHARED,
> >       };
> > +     int r;
> >
> > -     WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
> > +     r = kvm_pgtable_walk(pgt, addr, size, &walker);
> > +     WARN_ON(r && r != -EAGAIN);
>
> I could've been more explicit last time around, could you please tone
> this down to WARN_ON_ONCE() as well?

Will do, thanks.

>
> >       return data.young;
> >  }
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 8bcab0cc3fe9..a62c27a347ed 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1773,25 +1773,39 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> >  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> >  {
> >       u64 size = (range->end - range->start) << PAGE_SHIFT;
> > +     bool young = false;
> > +
> > +     read_lock(&kvm->mmu_lock);
> >
> >       if (!kvm->arch.mmu.pgt)
> >               return false;
>
> I'm guessing you meant to have 'goto out' here, since this early return
> fails to drop the mmu_lock.

Ah sorry! Thanks for catching this.
diff mbox series

Patch

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 58f09370d17e..7a1af8141c0e 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -22,6 +22,7 @@  menuconfig KVM
 	select KVM_COMMON
 	select KVM_GENERIC_HARDWARE_ENABLING
 	select KVM_GENERIC_MMU_NOTIFIER
+	select KVM_MMU_NOTIFIER_YOUNG_LOCKLESS
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select KVM_MMIO
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 9e2bbee77491..b1b0f7148cff 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1319,10 +1319,10 @@  static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
 	data->young = true;
 
 	/*
-	 * stage2_age_walker() is always called while holding the MMU lock for
-	 * write, so this will always succeed. Nonetheless, this deliberately
-	 * follows the race detection pattern of the other stage-2 walkers in
-	 * case the locking mechanics of the MMU notifiers is ever changed.
+	 * This walk may not be exclusive; the PTE is permitted to change
+	 * from under us. If there is a race to update this PTE, then the
+	 * GFN is most likely young, so failing to clear the AF is likely
+	 * to be inconsequential.
 	 */
 	if (data->mkold && !stage2_try_set_pte(ctx, new))
 		return -EAGAIN;
@@ -1345,10 +1345,13 @@  bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
 	struct kvm_pgtable_walker walker = {
 		.cb		= stage2_age_walker,
 		.arg		= &data,
-		.flags		= KVM_PGTABLE_WALK_LEAF,
+		.flags		= KVM_PGTABLE_WALK_LEAF |
+				  KVM_PGTABLE_WALK_SHARED,
 	};
+	int r;
 
-	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+	r = kvm_pgtable_walk(pgt, addr, size, &walker);
+	WARN_ON(r && r != -EAGAIN);
 	return data.young;
 }
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8bcab0cc3fe9..a62c27a347ed 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1773,25 +1773,39 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	u64 size = (range->end - range->start) << PAGE_SHIFT;
+	bool young = false;
+
+	read_lock(&kvm->mmu_lock);
 
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
-	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
-						   range->start << PAGE_SHIFT,
-						   size, true);
+	young = kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+						    range->start << PAGE_SHIFT,
+						    size, true);
+
+out:
+	read_unlock(&kvm->mmu_lock);
+	return young;
 }
 
 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	u64 size = (range->end - range->start) << PAGE_SHIFT;
+	bool young = false;
+
+	read_lock(&kvm->mmu_lock);
 
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
-	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
-						   range->start << PAGE_SHIFT,
-						   size, false);
+	young = kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+						    range->start << PAGE_SHIFT,
+						    size, false);
+
+out:
+	read_unlock(&kvm->mmu_lock);
+	return young;
 }
 
 phys_addr_t kvm_mmu_get_httbr(void)