
[RFC,10/17] KVM: arm64: Assume a table pte is already owned in post-order traversal

Message ID 20220415215901.1737897-11-oupton@google.com (mailing list archive)
State New, archived
Series KVM: arm64: Parallelize stage 2 fault handling

Commit Message

Oliver Upton April 15, 2022, 9:58 p.m. UTC
For parallel walks that collapse a table into a block, KVM ensures a locked
invalid pte is visible to all observers during the pre-order traversal. As
such, there is no need to try breaking the pte again in the post-order visit.

Directly set the pte if it has already been broken.

Signed-off-by: Oliver Upton <oupton@google.com>
---
 arch/arm64/kvm/hyp/pgtable.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)
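
The intended flow, as a rough sketch (helper names are taken from the diff
below; stage2_try_break_pte() is introduced by an earlier patch in this
series, so its use here is assumed rather than quoted):

/*
 * Rough sketch of collapsing a table into a block under a parallel walk.
 * Not code from this series; helper names follow the diff below.
 */
static int collapse_to_block_sketch(u64 addr, u64 end, u32 level,
				    kvm_pte_t *ptep, kvm_pte_t *old,
				    struct stage2_map_data *data, bool shared)
{
	/*
	 * Pre-order: break the valid table pte by replacing it with a
	 * locked, invalid value. Once this succeeds, no other walker can
	 * modify the pte, so this walker owns it.
	 */
	if (!stage2_try_break_pte(ptep, *old, addr, level, shared, data))
		return -EAGAIN;

	/* ... walk and free the now-unreachable child table ... */

	/*
	 * Post-order: the pte is still locked from the pre-order visit, so
	 * install the block mapping directly rather than breaking the pte
	 * a second time (what the new 'locked' argument conveys).
	 */
	return stage2_map_walker_try_leaf(addr, end, level, ptep, *old,
					  data, shared, true);
}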

Comments

Ben Gardon April 21, 2022, 4:11 p.m. UTC | #1
On Fri, Apr 15, 2022 at 2:59 PM Oliver Upton <oupton@google.com> wrote:
>
> For parallel walks that collapse a table into a block KVM ensures a
> locked invalid pte is visible to all observers in pre-order traversal.
> As such, there is no need to try breaking the pte again.

When you're doing the pre and post-order traversals, are they
implemented as separate traversals from the root, or is it a kind of
pre and post-order where non-leaf nodes are visited on the way down
and on the way up?
I assume either could be made to work, but the re-traversal from the
root probably minimizes TLB flushes, whereas the pre-and-post-order
would be a more efficient walk?

>
> Directly set the pte if it has already been broken.
>
> Signed-off-by: Oliver Upton <oupton@google.com>
> ---
>  arch/arm64/kvm/hyp/pgtable.c | 22 ++++++++++++++++------
>  1 file changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 146fc44acf31..121818d4c33e 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -924,7 +924,7 @@ static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level,
>  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>                                       kvm_pte_t *ptep, kvm_pte_t old,
>                                       struct stage2_map_data *data,
> -                                     bool shared)
> +                                     bool shared, bool locked)
>  {
>         kvm_pte_t new;
>         u64 granule = kvm_granule_size(level), phys = data->phys;
> @@ -948,7 +948,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>         if (!stage2_pte_needs_update(old, new))
>                 return -EAGAIN;
>
> -       if (!stage2_try_break_pte(ptep, old, addr, level, shared, data))
> +       if (!locked && !stage2_try_break_pte(ptep, old, addr, level, shared, data))
>                 return -EAGAIN;
>
>         /* Perform CMOs before installation of the guest stage-2 PTE */
> @@ -987,7 +987,8 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
>  }
>
>  static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> -                               kvm_pte_t *old, struct stage2_map_data *data, bool shared)
> +                               kvm_pte_t *old, struct stage2_map_data *data, bool shared,
> +                               bool locked)
>  {
>         struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
>         kvm_pte_t *childp, pte;
> @@ -998,10 +999,13 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>                 return 0;
>         }
>
> -       ret = stage2_map_walker_try_leaf(addr, end, level, ptep, *old, data, shared);
> +       ret = stage2_map_walker_try_leaf(addr, end, level, ptep, *old, data, shared, locked);
>         if (ret != -E2BIG)
>                 return ret;
>
> +       /* We should never attempt installing a table in post-order */
> +       WARN_ON(locked);
> +
>         if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
>                 return -EINVAL;
>
> @@ -1048,7 +1052,13 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
>                 childp = data->childp;
>                 data->anchor = NULL;
>                 data->childp = NULL;
> -               ret = stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared);
> +
> +               /*
> +                * We are guaranteed exclusive access to the pte in post-order
> +                * traversal since the locked value was made visible to all
> +                * observers in stage2_map_walk_table_pre.
> +                */
> +               ret = stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared, true);
>         } else {
>                 childp = kvm_pte_follow(*old, mm_ops);
>         }
> @@ -1087,7 +1097,7 @@ static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, kvm_
>         case KVM_PGTABLE_WALK_TABLE_PRE:
>                 return stage2_map_walk_table_pre(addr, end, level, ptep, old, data, shared);
>         case KVM_PGTABLE_WALK_LEAF:
> -               return stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared);
> +               return stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared, false);
>         case KVM_PGTABLE_WALK_TABLE_POST:
>                 return stage2_map_walk_table_post(addr, end, level, ptep, old, data, shared);
>         }
> --
> 2.36.0.rc0.470.gd361397f0d-goog
>
Oliver Upton April 21, 2022, 5:16 p.m. UTC | #2
On Thu, Apr 21, 2022 at 09:11:37AM -0700, Ben Gardon wrote:
> On Fri, Apr 15, 2022 at 2:59 PM Oliver Upton <oupton@google.com> wrote:
> >
> > For parallel walks that collapse a table into a block KVM ensures a
> > locked invalid pte is visible to all observers in pre-order traversal.
> > As such, there is no need to try breaking the pte again.
> 
> When you're doing the pre and post-order traversals, are they
> implemented as separate traversals from the root, or is it a kind of
> pre and post-order where non-leaf nodes are visited on the way down
> and on the way up?

The latter. We do one walk of the tables and fire the appropriate
visitor callbacks based on what part of the walk we're in.
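
A simplified sketch of such a single-pass walk, loosely modeled on
__kvm_pgtable_visit() (the real walker's recursion and error handling are
omitted):

/*
 * Simplified sketch of a single-pass walk that fires pre-order, leaf and
 * post-order callbacks; details of the real walker differ.
 */
static int visit_sketch(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
			struct kvm_pgtable_walker *walker)
{
	kvm_pte_t pte = READ_ONCE(*ptep);
	int ret = 0;

	if (!kvm_pte_table(pte, level)) {
		/* Leaf entries get exactly one visit. */
		if (walker->flags & KVM_PGTABLE_WALK_LEAF)
			ret = walker->cb(addr, end, level, ptep,
					 KVM_PGTABLE_WALK_LEAF, walker->arg);
		return ret;
	}

	/* Table entries are visited on the way down... */
	if (walker->flags & KVM_PGTABLE_WALK_TABLE_PRE)
		ret = walker->cb(addr, end, level, ptep,
				 KVM_PGTABLE_WALK_TABLE_PRE, walker->arg);
	if (ret)
		return ret;

	/* ... the walk then recurses into the child table (omitted) ... */

	/* ... and the same table entry is visited again on the way back up. */
	if (walker->flags & KVM_PGTABLE_WALK_TABLE_POST)
		ret = walker->cb(addr, end, level, ptep,
				 KVM_PGTABLE_WALK_TABLE_POST, walker->arg);

	return ret;
}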

> I assume either could be made to work, but the re-traversal from the
> root probably minimizes TLB flushes, whereas the pre-and-post-order
> would be a more efficient walk?

When we need to start doing operations on a whole range of memory this
way I completely agree (collapse to 2M, shatter to 4K for a memslot,
etc.).

For the current use cases of the stage 2 walker, coalescing TLBIs would
require better heuristics for when to blast all of stage 2 vs. doing a TLBI
with an IPA argument. IOW, we go through a decent bit of trouble to avoid
flushing all of stage 2 unless it is deemed necessary. The other unfortunate
thing is that I doubt such observations are portable between implementations,
so the point where we cut over to a full flush is likely highly dependent
on the microarch.

Later revisions of the ARM architecture bring us TLBI instructions that
take a range argument, which could help a lot in this department.
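
A hypothetical illustration of that trade-off (this helper does not exist in
KVM, and the cutover value is arbitrary):

/*
 * Hypothetical helper illustrating the full-flush vs. per-IPA trade-off;
 * the cutover point is a made-up number and would be highly microarch
 * dependent.
 */
static void stage2_flush_range_sketch(struct kvm_s2_mmu *mmu,
				      u64 start, u64 end, u32 level)
{
	const u64 flush_all_cutoff = 512;	/* arbitrary */
	u64 granule = kvm_granule_size(level);
	u64 ipa;

	if ((end - start) / granule > flush_all_cutoff) {
		/* Past some point, blasting all of stage 2 wins. */
		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
		return;
	}

	/* Otherwise, invalidate by IPA one granule at a time. */
	for (ipa = start; ipa < end; ipa += granule)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
}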

--
Thanks,
Oliver

Patch

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 146fc44acf31..121818d4c33e 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -924,7 +924,7 @@  static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level,
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 				      kvm_pte_t *ptep, kvm_pte_t old,
 				      struct stage2_map_data *data,
-				      bool shared)
+				      bool shared, bool locked)
 {
 	kvm_pte_t new;
 	u64 granule = kvm_granule_size(level), phys = data->phys;
@@ -948,7 +948,7 @@  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 	if (!stage2_pte_needs_update(old, new))
 		return -EAGAIN;
 
-	if (!stage2_try_break_pte(ptep, old, addr, level, shared, data))
+	if (!locked && !stage2_try_break_pte(ptep, old, addr, level, shared, data))
 		return -EAGAIN;
 
 	/* Perform CMOs before installation of the guest stage-2 PTE */
@@ -987,7 +987,8 @@  static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 }
 
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
-				kvm_pte_t *old, struct stage2_map_data *data, bool shared)
+				kvm_pte_t *old, struct stage2_map_data *data, bool shared,
+				bool locked)
 {
 	struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 	kvm_pte_t *childp, pte;
@@ -998,10 +999,13 @@  static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 		return 0;
 	}
 
-	ret = stage2_map_walker_try_leaf(addr, end, level, ptep, *old, data, shared);
+	ret = stage2_map_walker_try_leaf(addr, end, level, ptep, *old, data, shared, locked);
 	if (ret != -E2BIG)
 		return ret;
 
+	/* We should never attempt installing a table in post-order */
+	WARN_ON(locked);
+
 	if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
 		return -EINVAL;
 
@@ -1048,7 +1052,13 @@  static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
 		childp = data->childp;
 		data->anchor = NULL;
 		data->childp = NULL;
-		ret = stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared);
+
+		/*
+		 * We are guaranteed exclusive access to the pte in post-order
+		 * traversal since the locked value was made visible to all
+		 * observers in stage2_map_walk_table_pre.
+		 */
+		ret = stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared, true);
 	} else {
 		childp = kvm_pte_follow(*old, mm_ops);
 	}
@@ -1087,7 +1097,7 @@  static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, kvm_
 	case KVM_PGTABLE_WALK_TABLE_PRE:
 		return stage2_map_walk_table_pre(addr, end, level, ptep, old, data, shared);
 	case KVM_PGTABLE_WALK_LEAF:
-		return stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared);
+		return stage2_map_walk_leaf(addr, end, level, ptep, old, data, shared, false);
 	case KVM_PGTABLE_WALK_TABLE_POST:
 		return stage2_map_walk_table_post(addr, end, level, ptep, old, data, shared);
 	}
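
For reference, stage2_try_break_pte() is added by an earlier patch in this
series and is not part of this diff. A hypothetical sketch of the break side,
assuming a reserved invalid encoding is used to mark the pte as locked:

/*
 * Hypothetical sketch of the break side of the protocol; the real
 * stage2_try_break_pte() lives in an earlier patch of this series and may
 * differ. A reserved invalid encoding marks the pte as locked/owned.
 */
#define KVM_INVALID_PTE_LOCKED_SKETCH	BIT(10)	/* assumed software bit */

static bool stage2_try_break_pte_sketch(kvm_pte_t *ptep, kvm_pte_t old,
					u64 addr, u32 level, bool shared,
					struct stage2_map_data *data)
{
	if (shared) {
		/* Atomically take ownership; back off if another walker won. */
		if (cmpxchg(ptep, old, KVM_INVALID_PTE_LOCKED_SKETCH) != old)
			return false;
	} else {
		WRITE_ONCE(*ptep, KVM_INVALID_PTE_LOCKED_SKETCH);
	}

	/* Invalidate stale TLB entries before the pte is reused. */
	if (kvm_pte_valid(old))
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);

	return true;
}

Once the pre-order visit has installed the locked value, the post-order visit
can pass locked = true and have stage2_map_walker_try_leaf() write the block
pte directly.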