
[v5,09/14] KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks

Message ID 20221107215644.1895162-10-oliver.upton@linux.dev (mailing list archive)
State New, archived
Series KVM: arm64: Parallel stage-2 fault handling

Commit Message

Oliver Upton Nov. 7, 2022, 9:56 p.m. UTC
The stage2 attr walker is already used for parallel walks. Since commit
f783ef1c0e82 ("KVM: arm64: Add fast path to handle permission relaxation
during dirty logging"), KVM acquires the read lock when
write-unprotecting a PTE. However, the walker only uses a simple store
to update the PTE. This is safe as the only possible race is with
hardware updates to the access flag, which is benign.

However, a subsequent change to KVM will allow more changes to the stage
2 page tables to be done in parallel. Prepare the stage 2 attribute
walker by performing atomic updates to the PTE when walking in parallel.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 arch/arm64/kvm/hyp/pgtable.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)
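
To see why the shared walk needs more than a plain store, here is a minimal,
standalone sketch of the two update styles. The types and helper names are
illustrative only, and compiler builtins stand in for the kernel's
WRITE_ONCE()/cmpxchg(); the patch's real helper is stage2_try_set_pte() in
arch/arm64/kvm/hyp/pgtable.c.

	#include <stdbool.h>
	#include <stdint.h>

	typedef uint64_t fake_pte_t;	/* stand-in for kvm_pte_t */

	/* Exclusive walk: the write lock is held, so a plain store cannot race. */
	static void pte_set_exclusive(fake_pte_t *ptep, fake_pte_t new_pte)
	{
		__atomic_store_n(ptep, new_pte, __ATOMIC_RELAXED);
	}

	/*
	 * Shared walk: another walker (or the CPU's hardware Access Flag update)
	 * may have changed the PTE since it was read.  Only install new_pte if
	 * the entry still holds the value read earlier; otherwise report failure
	 * so the caller can retry (the patch returns -EAGAIN in that case).
	 */
	static bool pte_try_set_shared(fake_pte_t *ptep, fake_pte_t old_pte,
				       fake_pte_t new_pte)
	{
		return __atomic_compare_exchange_n(ptep, &old_pte, new_pte, false,
						   __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
	}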

Comments

Ben Gardon Nov. 9, 2022, 10:26 p.m. UTC | #1
On Mon, Nov 7, 2022 at 1:58 PM Oliver Upton <oliver.upton@linux.dev> wrote:
>
> The stage2 attr walker is already used for parallel walks. Since commit
> f783ef1c0e82 ("KVM: arm64: Add fast path to handle permission relaxation
> during dirty logging"), KVM acquires the read lock when
> write-unprotecting a PTE. However, the walker only uses a simple store
> to update the PTE. This is safe as the only possible race is with
> hardware updates to the access flag, which is benign.
>
> However, a subsequent change to KVM will allow more changes to the stage
> 2 page tables to be done in parallel. Prepare the stage 2 attribute
> walker by performing atomic updates to the PTE when walking in parallel.
>
> Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
> ---
>  arch/arm64/kvm/hyp/pgtable.c | 31 ++++++++++++++++++++++---------
>  1 file changed, 22 insertions(+), 9 deletions(-)
>
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index d8d963521d4e..a34e2050f931 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -185,7 +185,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
>                                       kvm_pteref_t pteref, u32 level)
>  {
>         enum kvm_pgtable_walk_flags flags = data->walker->flags;
> -       kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false);
> +       kvm_pte_t *ptep = kvm_dereference_pteref(pteref, flags & KVM_PGTABLE_WALK_SHARED);
>         struct kvm_pgtable_visit_ctx ctx = {
>                 .ptep   = ptep,
>                 .old    = READ_ONCE(*ptep),
> @@ -675,6 +675,16 @@ static bool stage2_pte_is_counted(kvm_pte_t pte)
>         return !!pte;
>  }
>
> +static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
> +{
> +       if (!kvm_pgtable_walk_shared(ctx)) {
> +               WRITE_ONCE(*ctx->ptep, new);
> +               return true;
> +       }
> +
> +       return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
> +}
> +
>  static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
>                            struct kvm_pgtable_mm_ops *mm_ops)
>  {
> @@ -986,7 +996,9 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
>                     stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
>                         mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
>                                                   kvm_granule_size(ctx->level));
> -               WRITE_ONCE(*ctx->ptep, pte);
> +
> +               if (!stage2_try_set_pte(ctx, pte))
> +                       return -EAGAIN;
>         }
>
>         return 0;
> @@ -995,7 +1007,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
>  static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
>                                     u64 size, kvm_pte_t attr_set,
>                                     kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
> -                                   u32 *level)
> +                                   u32 *level, enum kvm_pgtable_walk_flags flags)
>  {
>         int ret;
>         kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
> @@ -1006,7 +1018,7 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
>         struct kvm_pgtable_walker walker = {
>                 .cb             = stage2_attr_walker,
>                 .arg            = &data,
> -               .flags          = KVM_PGTABLE_WALK_LEAF,
> +               .flags          = flags | KVM_PGTABLE_WALK_LEAF,
>         };
>
>         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
> @@ -1025,14 +1037,14 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
>  {
>         return stage2_update_leaf_attrs(pgt, addr, size, 0,
>                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
> -                                       NULL, NULL);
> +                                       NULL, NULL, 0);
>  }
>
>  kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
>  {
>         kvm_pte_t pte = 0;
>         stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
> -                                &pte, NULL);
> +                                &pte, NULL, 0);
>         dsb(ishst);
>         return pte;
>  }
> @@ -1041,7 +1053,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
>  {
>         kvm_pte_t pte = 0;
>         stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
> -                                &pte, NULL);
> +                                &pte, NULL, 0);
>         /*
>          * "But where's the TLBI?!", you scream.
>          * "Over in the core code", I sigh.
> @@ -1054,7 +1066,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
>  bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
>  {
>         kvm_pte_t pte = 0;
> -       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
> +       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);

Would be nice to have an enum for KVM_PGTABLE_WALK_EXCLUSIVE so this
doesn't just have to pass 0.
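
For illustration only (no such flag exists in this series, and the spelling
below is an assumption), the suggestion would amount to giving the zero
value a name:

	enum kvm_pgtable_walk_flags {
		KVM_PGTABLE_WALK_EXCLUSIVE	= 0,	/* hypothetical: "no shared walk" */
		KVM_PGTABLE_WALK_LEAF		= BIT(0),
		/* existing flags unchanged */
	};

	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL,
				 KVM_PGTABLE_WALK_EXCLUSIVE);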


>         return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
>  }
>
> @@ -1077,7 +1089,8 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
>         if (prot & KVM_PGTABLE_PROT_X)
>                 clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
>
> -       ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
> +       ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
> +                                      KVM_PGTABLE_WALK_SHARED);
>         if (!ret)
>                 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
>         return ret;
> --
> 2.38.1.431.g37b22c650d-goog
>
Sean Christopherson Nov. 9, 2022, 10:42 p.m. UTC | #2
On Wed, Nov 09, 2022, Ben Gardon wrote:
> On Mon, Nov 7, 2022 at 1:58 PM Oliver Upton <oliver.upton@linux.dev> wrote:
> > @@ -1054,7 +1066,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
> >  bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
> >  {
> >         kvm_pte_t pte = 0;
> > -       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
> > +       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
> 
> Would be nice to have an enum for KVM_PGTABLE_WALK_EXCLUSIVE so this
> doesn't just have to pass 0.

That's also dangerous though since the param is a set of flags, not unique,
arbitrary values.  E.g. this won't do the expected thing

	if (flags & KVM_PGTABLE_WALK_EXCLUSIVE)

I assume compilers would complain, but never say never when it comes to compilers :-)
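
To make the pitfall concrete: if the hypothetical flag were given the only
value it could sensibly have (0), a bitwise test against it is always false,
and the check would have to be written against the SHARED bit instead (the
helper name below is made up for illustration):

	#define KVM_PGTABLE_WALK_EXCLUSIVE	0	/* hypothetical, not in the series */

	if (flags & KVM_PGTABLE_WALK_EXCLUSIVE)		/* flags & 0 is always 0 */
		take_exclusive_path();			/* never reached */

	if (!(flags & KVM_PGTABLE_WALK_SHARED))		/* what the test really has to be */
		take_exclusive_path();
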
Ben Gardon Nov. 9, 2022, 11 p.m. UTC | #3
On Wed, Nov 9, 2022 at 2:42 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Nov 09, 2022, Ben Gardon wrote:
> > On Mon, Nov 7, 2022 at 1:58 PM Oliver Upton <oliver.upton@linux.dev> wrote:
> > > @@ -1054,7 +1066,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
> > >  bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
> > >  {
> > >         kvm_pte_t pte = 0;
> > > -       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
> > > +       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
> >
> > Would be nice to have an enum for KVM_PGTABLE_WALK_EXCLUSIVE so this
> > doesn't just have to pass 0.
>
> That's also dangerous though since the param is a set of flags, not unique,
> arbitrary values.  E.g. this won't do the expected thing
>
>         if (flags & KVM_PGTABLE_WALK_EXCLUSIVE)
>
> I assume compilers would complain, but never say never when it comes to compilers :-)

Yeah, I was thinking about that too. IMO using one enum for multiple
flags is kind of an abuse of the enum. If you're going to put multiple
orthogonal flags in an int or whatever, it would probably be best to
have separate enums for each flag. That way you can define masks to
extract the enum from the int and only compare with == and != as
opposed to using &.
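
A rough sketch of that shape, with entirely made-up names (not a proposal for
this series): each orthogonal setting gets its own enum and its own bit range,
and an accessor extracts it so call sites compare with == / != rather than &.

	enum walk_locking {
		WALK_EXCLUSIVE	= 0x0,
		WALK_SHARED	= 0x1,
	};
	#define WALK_LOCKING_MASK	0x1

	enum walk_visit {
		WALK_LEAF	= 0x2,
		WALK_TABLE_PRE	= 0x4,
	};
	#define WALK_VISIT_MASK		0x6

	static inline enum walk_locking walk_locking(unsigned int flags)
	{
		return flags & WALK_LOCKING_MASK;
	}

	/* Call sites then read as an equality check, not a bit test: */
	if (walk_locking(flags) == WALK_EXCLUSIVE)
		WRITE_ONCE(*ptep, new_pte);
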
Marc Zyngier Nov. 10, 2022, 1:40 p.m. UTC | #4
On Wed, 09 Nov 2022 23:00:16 +0000,
Ben Gardon <bgardon@google.com> wrote:
> 
> On Wed, Nov 9, 2022 at 2:42 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Wed, Nov 09, 2022, Ben Gardon wrote:
> > > On Mon, Nov 7, 2022 at 1:58 PM Oliver Upton <oliver.upton@linux.dev> wrote:
> > > > @@ -1054,7 +1066,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
> > > >  bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
> > > >  {
> > > >         kvm_pte_t pte = 0;
> > > > -       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
> > > > +       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
> > >
> > > Would be nice to have an enum for KVM_PGTABLE_WALK_EXCLUSIVE so this
> > > doesn't just have to pass 0.
> >
> > That's also dangerous though since the param is a set of flags, not unique,
> > arbitrary values.  E.g. this won't do the expected thing
> >
> >         if (flags & KVM_PGTABLE_WALK_EXCLUSIVE)
> >
> > I assume compilers would complain, but never say never when it comes to compilers :-)
> 
> Yeah, I was thinking about that too. IMO using one enum for multiple
> flags is kind of an abuse of the enum. If you're going to put multiple
> orthogonal flags in an int or whatever, it would probably be best to
> have separate enums for each flag. That way you can define masks to
> extract the enum from the int and only compare with == and != as
> opposed to using &.

Too late. The kernel is full of this (look at the irq code, for
example), and we happily use this construct all over the (oh wait!)
page table code to construct permissions and other things.

At this stage, this is an established construct. Compiler people can
try and break this habit, good luck to them ;-).

	M.

Patch

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index d8d963521d4e..a34e2050f931 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -185,7 +185,7 @@  static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
 				      kvm_pteref_t pteref, u32 level)
 {
 	enum kvm_pgtable_walk_flags flags = data->walker->flags;
-	kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false);
+	kvm_pte_t *ptep = kvm_dereference_pteref(pteref, flags & KVM_PGTABLE_WALK_SHARED);
 	struct kvm_pgtable_visit_ctx ctx = {
 		.ptep	= ptep,
 		.old	= READ_ONCE(*ptep),
@@ -675,6 +675,16 @@  static bool stage2_pte_is_counted(kvm_pte_t pte)
 	return !!pte;
 }
 
+static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
+{
+	if (!kvm_pgtable_walk_shared(ctx)) {
+		WRITE_ONCE(*ctx->ptep, new);
+		return true;
+	}
+
+	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
+}
+
 static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
 			   struct kvm_pgtable_mm_ops *mm_ops)
 {
@@ -986,7 +996,9 @@  static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
 		    stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
 			mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
 						  kvm_granule_size(ctx->level));
-		WRITE_ONCE(*ctx->ptep, pte);
+
+		if (!stage2_try_set_pte(ctx, pte))
+			return -EAGAIN;
 	}
 
 	return 0;
@@ -995,7 +1007,7 @@  static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
 				    u64 size, kvm_pte_t attr_set,
 				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
-				    u32 *level)
+				    u32 *level, enum kvm_pgtable_walk_flags flags)
 {
 	int ret;
 	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
@@ -1006,7 +1018,7 @@  static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
 	struct kvm_pgtable_walker walker = {
 		.cb		= stage2_attr_walker,
 		.arg		= &data,
-		.flags		= KVM_PGTABLE_WALK_LEAF,
+		.flags		= flags | KVM_PGTABLE_WALK_LEAF,
 	};
 
 	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
@@ -1025,14 +1037,14 @@  int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
 	return stage2_update_leaf_attrs(pgt, addr, size, 0,
 					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
-					NULL, NULL);
+					NULL, NULL, 0);
 }
 
 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
 {
 	kvm_pte_t pte = 0;
 	stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
-				 &pte, NULL);
+				 &pte, NULL, 0);
 	dsb(ishst);
 	return pte;
 }
@@ -1041,7 +1053,7 @@  kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
 {
 	kvm_pte_t pte = 0;
 	stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
-				 &pte, NULL);
+				 &pte, NULL, 0);
 	/*
 	 * "But where's the TLBI?!", you scream.
 	 * "Over in the core code", I sigh.
@@ -1054,7 +1066,7 @@  kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
 bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
 {
 	kvm_pte_t pte = 0;
-	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
+	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
 	return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
 }
 
@@ -1077,7 +1089,8 @@  int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 	if (prot & KVM_PGTABLE_PROT_X)
 		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
 
-	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
+	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
+				       KVM_PGTABLE_WALK_SHARED);
 	if (!ret)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
 	return ret;