diff mbox series

[v2,04/16] KVM: arm64: Optimize host memory aborts

Message ID 20210726092905.2198501-5-qperret@google.com (mailing list archive)
State New, archived
Headers show
Series Track shared pages at EL2 in protected mode | expand

Commit Message

Quentin Perret July 26, 2021, 9:28 a.m. UTC
The kvm_pgtable_stage2_find_range() function is used in the host memory
abort path to try and look for the largest block mapping that can be
used to map the faulting address. In order to do so, the function
currently walks the stage-2 page-table and looks for existing
incompatible mappings within the range of the largest possible block.
If incompatible mappings are found, it tries the same procedure again,
but using a smaller block range, and repeats until a matching range is
found (potentially up to page granularity). While this approach has
benefits (mostly that it proactively coalesces host stage-2 mappings),
it can be slow if the ranges are fragmented, and it isn't optimized for
several CPUs faulting on the same IPA, as each of them will redo all
the work every time.

To avoid these issues, remove kvm_pgtable_stage2_find_range(), and walk
the page-table only once in the host_mem_abort() path to find the
closest leaf to the input address. With this, use the range
corresponding to that leaf if the leaf is invalid and not owned by
another entity. If a valid leaf is found, return -EAGAIN, similar to
what is done in the kvm_pgtable_stage2_map() path to optimize
concurrent faults.

Signed-off-by: Quentin Perret <qperret@google.com>
---
 arch/arm64/include/asm/kvm_pgtable.h  | 30 -----------
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 39 +++++++++++++-
 arch/arm64/kvm/hyp/pgtable.c          | 74 ---------------------------
 3 files changed, 38 insertions(+), 105 deletions(-)

Comments

Marc Zyngier July 26, 2021, 10:35 a.m. UTC | #1
On Mon, 26 Jul 2021 10:28:53 +0100,
Quentin Perret <qperret@google.com> wrote:
> 
> The kvm_pgtable_stage2_find_range() function is used in the host memory
> abort path to try and look for the largest block mapping that can be
> used to map the faulting address. In order to do so, the function
> currently walks the stage-2 page-table and looks for existing
> incompatible mappings within the range of the largest possible block.
> If incompatible mappings are found, it tries the same procedure again,
> but using a smaller block range, and repeats until a matching range is
> found (potentially up to page granularity). While this approach has
> benefits (mostly in the fact that it proactively coalesces host stage-2
> mappings), it can be slow if the ranges are fragmented, and it isn't
> optimized to deal with CPUs faulting on the same IPA as all of them will
> do all the work every time.
> 
> To avoid these issues, remove kvm_pgtable_stage2_find_range(), and walk
> the page-table only once in the host_mem_abort() path to find the
> closest leaf to the input address. With this, use the corresponding
> range if it is invalid and not owned by another entity. If a valid leaf
> is found, return -EAGAIN similar to what is done in the
> kvm_pgtable_stage2_map() path to optimize concurrent faults.
> 
> Signed-off-by: Quentin Perret <qperret@google.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h  | 30 -----------
>  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 39 +++++++++++++-
>  arch/arm64/kvm/hyp/pgtable.c          | 74 ---------------------------
>  3 files changed, 38 insertions(+), 105 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 5a7a13bbd4a1..cec76a49f521 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -141,16 +141,6 @@ enum kvm_pgtable_prot {
>  #define PAGE_HYP_RO		(KVM_PGTABLE_PROT_R)
>  #define PAGE_HYP_DEVICE		(PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
>  
> -/**
> - * struct kvm_mem_range - Range of Intermediate Physical Addresses
> - * @start:	Start of the range.
> - * @end:	End of the range.
> - */
> -struct kvm_mem_range {
> -	u64 start;
> -	u64 end;
> -};
> -
>  /**
>   * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
>   * @KVM_PGTABLE_WALK_LEAF:		Visit leaf entries, including invalid
> @@ -477,24 +467,4 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
>   */
>  int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
>  			 kvm_pte_t *ptep, u32 *level);
> -
> -/**
> - * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
> - *				     Addresses with compatible permission
> - *				     attributes.
> - * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
> - * @addr:	Address that must be covered by the range.
> - * @prot:	Protection attributes that the range must be compatible with.
> - * @range:	Range structure used to limit the search space at call time and
> - *		that will hold the result.
> - *
> - * The offset of @addr within a page is ignored. An IPA is compatible with @prot
> - * iff its corresponding stage-2 page-table entry has default ownership and, if
> - * valid, is mapped with protection attributes identical to @prot.
> - *
> - * Return: 0 on success, negative error code on failure.
> - */
> -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
> -				  enum kvm_pgtable_prot prot,
> -				  struct kvm_mem_range *range);
>  #endif	/* __ARM64_KVM_PGTABLE_H__ */
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> index 871149246f5f..01700a908bb7 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -159,6 +159,11 @@ static int host_stage2_unmap_dev_all(void)
>  	return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
>  }
>  
> +struct kvm_mem_range {
> +	u64 start;
> +	u64 end;
> +};
> +
>  static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
>  {
>  	int cur, left = 0, right = hyp_memblock_nr;
> @@ -227,6 +232,38 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
>  		__ret;							\
>  	 })
>
> +static int host_stage2_find_range(u64 addr, struct kvm_mem_range *range)

nit: I find 'find_range' a bit odd. We have already found a
range. We're just trying to narrow it down to something that fits in a
single block mapping. How about 'host_stage2_adjust_range'?

> +{
> +	u64 granule, start, end;
> +	kvm_pte_t pte;
> +	u32 level;
> +	int ret;
> +
> +	ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
> +	if (ret)
> +		return ret;
> +
> +	if (kvm_pte_valid(pte))
> +		return -EAGAIN;
> +
> +	if (pte)
> +		return -EPERM;
> +
> +	do {
> +		granule = kvm_granule_size(level);
> +		start = ALIGN_DOWN(addr, granule);
> +		end = start + granule;
> +		level++;
> +	} while ((level < KVM_PGTABLE_MAX_LEVELS) &&
> +			(!kvm_level_supports_block_mapping(level) ||
> +			 start < range->start || range->end < end));
> +

This expression does my head in. You are trying to find the largest
block mapping that entirely fits in range, right? Can we just express
that directly (with a global negation for the purpose of the loop)?

	do {
		[...]
	} while (level < KVM_PGTABLE_MAX_LEVELS &&
		 !(kvm_level_supports_block_mapping(level) &&
		   start >= range->start &&
		   end <= range->end));

I personally find this much more readable, because it expresses the
condition we are looking for rather than a lot of conditions forcing
us to continue.

You could also use a kvm_mem_range for the iteration, and add a helper
that checks for the inclusion.

> +	range->start = start;
> +	range->end = end;
> +
> +	return 0;
> +}
> +
>  static int host_stage2_idmap(u64 addr)
>  {
>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
> @@ -238,7 +275,7 @@ static int host_stage2_idmap(u64 addr)
>  		prot |= KVM_PGTABLE_PROT_X;
>  
>  	hyp_spin_lock(&host_kvm.lock);
> -	ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
> +	ret = host_stage2_find_range(addr, &range);
>  	if (ret)
>  		goto unlock;

Thanks,

	M.
Quentin Perret July 26, 2021, 1:13 p.m. UTC | #2
On Monday 26 Jul 2021 at 11:35:10 (+0100), Marc Zyngier wrote:
> On Mon, 26 Jul 2021 10:28:53 +0100,
> Quentin Perret <qperret@google.com> wrote:
> > +static int host_stage2_find_range(u64 addr, struct kvm_mem_range *range)
> 
> nit: I find 'find_range' a bit odd. We already have found a
> range. We're just trying to narrow it down to something that fits in a
> single block mapping. How about 'host_stage2_adjust_range'?

Ack.

> > +{
> > +	u64 granule, start, end;
> > +	kvm_pte_t pte;
> > +	u32 level;
> > +	int ret;
> > +
> > +	ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (kvm_pte_valid(pte))
> > +		return -EAGAIN;
> > +
> > +	if (pte)
> > +		return -EPERM;
> > +
> > +	do {
> > +		granule = kvm_granule_size(level);
> > +		start = ALIGN_DOWN(addr, granule);
> > +		end = start + granule;
> > +		level++;
> > +	} while ((level < KVM_PGTABLE_MAX_LEVELS) &&
> > +			(!kvm_level_supports_block_mapping(level) ||
> > +			 start < range->start || range->end < end));
> > +
> 
> This expression does my head in. You are trying to find the largest
> block mapping that entirely fits in range, right? Can we just express
> that directly (with a global negation for the purpose of the loop)?
> 
> 	do {
> 		[...]
> 	} while (level < KVM_PGTABLE_MAX_LEVELS &&
> 		 !(kvm_level_supports_block_mapping(level) &&
> 		   start >= range->start &&
> 		   end <= range->end));
> 
> I personally find this much more readable, because it expresses the
> condition we are looking for rather than a lot of conditions forcing
> us to continue.
> 
> You could also use a kvm_mem_range for the iteration, and add a helper
> that checks for the inclusion.

Something like this (untested)?

diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 75273166d2c5..07d228163090 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -234,9 +234,15 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
                __ret;                                                  \
         })

+static inline bool range_included(struct kvm_mem_range *child,
+                                 struct kvm_mem_range *parent)
+{
+       return parent->start <= child->start && child->end <= parent->end;
+}
+
 static int host_stage2_find_range(u64 addr, struct kvm_mem_range *range)
 {
-       u64 granule, start, end;
+       struct kvm_mem_range cur;
        kvm_pte_t pte;
        u32 level;
        int ret;
@@ -252,16 +258,15 @@ static int host_stage2_find_range(u64 addr, struct kvm_mem_range *range)
                return -EPERM;

        do {
-               granule = kvm_granule_size(level);
-               start = ALIGN_DOWN(addr, granule);
-               end = start + granule;
+               u64 granule = kvm_granule_size(level);
+               cur.start = ALIGN_DOWN(addr, granule);
+               cur.end = cur.start + granule;
                level++;
        } while ((level < KVM_PGTABLE_MAX_LEVELS) &&
-                       (!kvm_level_supports_block_mapping(level) ||
-                        start < range->start || range->end < end));
+                       !(kvm_level_supports_block_mapping(level) &&
+                         range_included(&cur, range)));

-       range->start = start;
-       range->end = end;
+       *range = cur;

        return 0;
 }
Marc Zyngier July 26, 2021, 1:24 p.m. UTC | #3
On Mon, 26 Jul 2021 14:13:06 +0100,
Quentin Perret <qperret@google.com> wrote:
> 
> On Monday 26 Jul 2021 at 11:35:10 (+0100), Marc Zyngier wrote:

[...]

> > You could also use a kvm_mem_range for the iteration, and add a helper
> > that checks for the inclusion.
> 
> Something like this (untested)?
> 
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> index 75273166d2c5..07d228163090 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -234,9 +234,15 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
>                 __ret;                                                  \
>          })
> 
> +static inline bool range_included(struct kvm_mem_range *child,
> +                                 struct kvm_mem_range *parent)
> +{
> +       return parent->start <= child->start && child->end <= parent->end;
> +}
> +
>  static int host_stage2_find_range(u64 addr, struct kvm_mem_range *range)
>  {
> -       u64 granule, start, end;
> +       struct kvm_mem_range cur;
>         kvm_pte_t pte;
>         u32 level;
>         int ret;
> @@ -252,16 +258,15 @@ static int host_stage2_find_range(u64 addr, struct kvm_mem_range *range)
>                 return -EPERM;
> 
>         do {
> -               granule = kvm_granule_size(level);
> -               start = ALIGN_DOWN(addr, granule);
> -               end = start + granule;
> +               u64 granule = kvm_granule_size(level);
> +               cur.start = ALIGN_DOWN(addr, granule);
> +               cur.end = cur.start + granule;
>                 level++;
>         } while ((level < KVM_PGTABLE_MAX_LEVELS) &&
> -                       (!kvm_level_supports_block_mapping(level) ||
> -                        start < range->start || range->end < end));
> +                       !(kvm_level_supports_block_mapping(level) &&
> +                         range_included(&cur, parent)));
> 
> -       range->start = start;
> -       range->end = end;
> +       *range = cur;
> 
>         return 0;
>  }
> 

Beautiful.

	M.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 5a7a13bbd4a1..cec76a49f521 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -141,16 +141,6 @@  enum kvm_pgtable_prot {
 #define PAGE_HYP_RO		(KVM_PGTABLE_PROT_R)
 #define PAGE_HYP_DEVICE		(PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
 
-/**
- * struct kvm_mem_range - Range of Intermediate Physical Addresses
- * @start:	Start of the range.
- * @end:	End of the range.
- */
-struct kvm_mem_range {
-	u64 start;
-	u64 end;
-};
-
 /**
  * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
  * @KVM_PGTABLE_WALK_LEAF:		Visit leaf entries, including invalid
@@ -477,24 +467,4 @@  int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
  */
 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
 			 kvm_pte_t *ptep, u32 *level);
-
-/**
- * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
- *				     Addresses with compatible permission
- *				     attributes.
- * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr:	Address that must be covered by the range.
- * @prot:	Protection attributes that the range must be compatible with.
- * @range:	Range structure used to limit the search space at call time and
- *		that will hold the result.
- *
- * The offset of @addr within a page is ignored. An IPA is compatible with @prot
- * iff its corresponding stage-2 page-table entry has default ownership and, if
- * valid, is mapped with protection attributes identical to @prot.
- *
- * Return: 0 on success, negative error code on failure.
- */
-int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
-				  enum kvm_pgtable_prot prot,
-				  struct kvm_mem_range *range);
 #endif	/* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 871149246f5f..01700a908bb7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -159,6 +159,11 @@  static int host_stage2_unmap_dev_all(void)
 	return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
 }
 
+struct kvm_mem_range {
+	u64 start;
+	u64 end;
+};
+
 static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
 {
 	int cur, left = 0, right = hyp_memblock_nr;
@@ -227,6 +232,38 @@  static inline int __host_stage2_idmap(u64 start, u64 end,
 		__ret;							\
 	 })
 
+static int host_stage2_find_range(u64 addr, struct kvm_mem_range *range)
+{
+	u64 granule, start, end;
+	kvm_pte_t pte;
+	u32 level;
+	int ret;
+
+	ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
+	if (ret)
+		return ret;
+
+	if (kvm_pte_valid(pte))
+		return -EAGAIN;
+
+	if (pte)
+		return -EPERM;
+
+	do {
+		granule = kvm_granule_size(level);
+		start = ALIGN_DOWN(addr, granule);
+		end = start + granule;
+		level++;
+	} while ((level < KVM_PGTABLE_MAX_LEVELS) &&
+			(!kvm_level_supports_block_mapping(level) ||
+			 start < range->start || range->end < end));
+
+	range->start = start;
+	range->end = end;
+
+	return 0;
+}
+
 static int host_stage2_idmap(u64 addr)
 {
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
@@ -238,7 +275,7 @@  static int host_stage2_idmap(u64 addr)
 		prot |= KVM_PGTABLE_PROT_X;
 
 	hyp_spin_lock(&host_kvm.lock);
-	ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+	ret = host_stage2_find_range(addr, &range);
 	if (ret)
 		goto unlock;
 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 2c5d4d3e31cc..55199e579863 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1115,77 +1115,3 @@  void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
 	pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
 	pgt->pgd = NULL;
 }
-
-#define KVM_PTE_LEAF_S2_COMPAT_MASK	(KVM_PTE_LEAF_ATTR_S2_PERMS | \
-					 KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
-					 KVM_PTE_LEAF_ATTR_S2_IGNORED)
-
-static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
-					  kvm_pte_t *ptep,
-					  enum kvm_pgtable_walk_flags flag,
-					  void * const arg)
-{
-	kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
-
-	/*
-	 * Compatible mappings are either invalid and owned by the page-table
-	 * owner (whose id is 0), or valid with matching permission attributes.
-	 */
-	if (kvm_pte_valid(pte)) {
-		old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
-		if (old_attr != *new_attr)
-			return -EEXIST;
-	} else if (pte) {
-		return -EEXIST;
-	}
-
-	return 0;
-}
-
-int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
-				  enum kvm_pgtable_prot prot,
-				  struct kvm_mem_range *range)
-{
-	kvm_pte_t attr;
-	struct kvm_pgtable_walker check_perm_walker = {
-		.cb		= stage2_check_permission_walker,
-		.flags		= KVM_PGTABLE_WALK_LEAF,
-		.arg		= &attr,
-	};
-	u64 granule, start, end;
-	u32 level;
-	int ret;
-
-	ret = stage2_set_prot_attr(pgt, prot, &attr);
-	if (ret)
-		return ret;
-	attr &= KVM_PTE_LEAF_S2_COMPAT_MASK;
-
-	for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) {
-		granule = kvm_granule_size(level);
-		start = ALIGN_DOWN(addr, granule);
-		end = start + granule;
-
-		if (!kvm_level_supports_block_mapping(level))
-			continue;
-
-		if (start < range->start || range->end < end)
-			continue;
-
-		/*
-		 * Check the presence of existing mappings with incompatible
-		 * permissions within the current block range, and try one level
-		 * deeper if one is found.
-		 */
-		ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker);
-		if (ret != -EEXIST)
-			break;
-	}
-
-	if (!ret) {
-		range->start = start;
-		range->end = end;
-	}
-
-	return ret;
-}