
[v3,04/12] KVM: arm64: Add kvm_pgtable_stage2_split()

Message ID 20230215174046.2201432-5-ricarkol@google.com (mailing list archive)
State New, archived
Series Implement Eager Page Splitting for ARM

Commit Message

Ricardo Koller Feb. 15, 2023, 5:40 p.m. UTC
Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
range of huge pages. This will be used for eager-splitting huge pages
into PAGE_SIZE pages. The goal is to avoid having to split huge pages
on write-protection faults, and instead use this function to do it
ahead of time for large ranges (e.g., all guest memory in 1G chunks at
a time).

No functional change intended. This new function will be used in a
subsequent commit.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
---
 arch/arm64/include/asm/kvm_pgtable.h |  30 ++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 105 +++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)
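
As a rough illustration (not from this series), a caller could drive the new
API by splitting a range in fixed-size chunks, topping up its page-table cache
before each call; split_cache_topup() and split_cache_capacity() below are
hypothetical placeholders, not existing kernel helpers:

  /* Hypothetical sketch of a chunked caller; not part of this patch. */
  static int stage2_split_range(struct kvm_pgtable *pgt, u64 addr, u64 end,
                                u64 chunk_size, void *cache)
  {
          int ret;

          while (addr < end) {
                  u64 size = min(chunk_size, end - addr);

                  /* Refill the cache so the next call can split blocks. */
                  ret = split_cache_topup(cache);         /* placeholder */
                  if (ret)
                          return ret;

                  ret = kvm_pgtable_stage2_split(pgt, addr, size, cache,
                                                 split_cache_capacity(cache));
                  if (ret)
                          return ret;

                  addr += size;
          }

          return 0;
  }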

Comments

Oliver Upton Feb. 16, 2023, 12:36 a.m. UTC | #1
On Wed, Feb 15, 2023 at 05:40:38PM +0000, Ricardo Koller wrote:

[...]

> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index fed314f2b320..e2fb78398b3d 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -1229,6 +1229,111 @@ int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
>  	return 0;
>  }
>  
> +struct stage2_split_data {
> +	struct kvm_s2_mmu		*mmu;
> +	void				*memcache;
> +	u64				mc_capacity;
> +};
> +
> +/*
> + * Get the number of page-tables needed to replace a bock with a fully
> + * populated tree, up to the PTE level, at particular level.
> + */
> +static inline u32 stage2_block_get_nr_page_tables(u32 level)
> +{
> +	switch (level) {
> +	/* There are no blocks at level 0 */
> +	case 1: return 1 + PTRS_PER_PTE;
> +	case 2: return 1;
> +	case 3: return 0;
> +	default:
> +		WARN_ON_ONCE(1);
> +		return ~0;
> +	}
> +}

This doesn't take into account our varying degrees of hugepage support
across page sizes. Perhaps:

  static inline int stage2_block_get_nr_page_tables(u32 level)
  {
          if (WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
                           level >= KVM_PGTABLE_MAX_LEVELS))
                  return -EINVAL;

          switch (level) {
          case 1:
                  return PTRS_PER_PTE + 1;
          case 2:
                  return 1;
          case 3:
                  return 0;
          }
  }

paired with an explicit error check and early return on the caller side.
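
For instance, the caller side could look something like this (sketch only;
nr_pages would need to become a signed type, e.g. s64, for the check to work):

          nr_pages = stage2_block_get_nr_page_tables(level);
          if (nr_pages < 0)
                  return nr_pages;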

> +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> +			       enum kvm_pgtable_walk_flags visit)
> +{
> +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> +	struct stage2_split_data *data = ctx->arg;
> +	kvm_pte_t pte = ctx->old, new, *childp;
> +	enum kvm_pgtable_prot prot;
> +	void *mc = data->memcache;
> +	u32 level = ctx->level;
> +	u64 phys, nr_pages;
> +	bool force_pte;
> +	int ret;
> +
> +	/* No huge-pages exist at the last level */
> +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> +		return 0;
> +
> +	/* We only split valid block mappings */
> +	if (!kvm_pte_valid(pte))
> +		return 0;
> +
> +	nr_pages = stage2_block_get_nr_page_tables(level);
> +	if (data->mc_capacity >= nr_pages) {
> +		/* Build a tree mapped down to the PTE granularity. */
> +		force_pte = true;
> +	} else {
> +		/*
> +		 * Don't force PTEs. This requires a single page of PMDs at the
> +		 * PUD level, or a single page of PTEs at the PMD level. If we
> +		 * are at the PUD level, the PTEs will be created recursively.
> +		 */
> +		force_pte = false;
> +		nr_pages = 1;
> +	}

Do we know if the 'else' branch here is even desirable? I.e. has
recursive shattering been tested with PUD hugepages (HugeTLB 1G) and
shown to improve guest performance while dirty tracking?

The observations we've made on existing systems were that the successive
break-before-make operations led to a measurable slowdown in guest
pre-copy performance. Recursively building the page tables should
actually result in *more* break-before-makes than if we just let the vCPU
fault path lazily shatter hugepages.

Shaoqin Huang Feb. 16, 2023, 12:22 p.m. UTC | #2
Hi Ricardo,

On 2/16/23 01:40, Ricardo Koller wrote:
> Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> range of huge pages. This will be used for eager-splitting huge pages
> into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> on write-protection faults, and instead use this function to do it
> ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> a time).
>
> No functional change intended. This new function will be used in a
> subsequent commit.
>
> Signed-off-by: Ricardo Koller <ricarkol@google.com>
> ---
>   arch/arm64/include/asm/kvm_pgtable.h |  30 ++++++++
>   arch/arm64/kvm/hyp/pgtable.c         | 105 +++++++++++++++++++++++++++
>   2 files changed, 135 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 2ea397ad3e63..b28489aa0994 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -658,6 +658,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
>    */
>   int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
>   
> +/**
> + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> + *				to PAGE_SIZE guest pages.
> + * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	 Intermediate physical address from which to split.
> + * @size:	 Size of the range.
> + * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
> + *		 page-table pages.
> + * @mc_capacity: Number of pages in @mc.
> + *
> + * @addr and the end (@addr + @size) are effectively aligned down and up to
> + * the top level huge-page block size. This is an example using 1GB
> + * huge-pages and 4KB granules.
> + *
> + *                          [---input range---]
> + *                          :                 :
> + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> + *                          :                 :
> + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> + *                          :                 :
> + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> + *                          :                 :
> + *
> + * Return: 0 on success, negative error code on failure. Note that
> + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> + * blocks in the input range as allowed by @mc_capacity.
> + */
> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity);
> +
>   /**
>    * kvm_pgtable_walk() - Walk a page-table.
>    * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index fed314f2b320..e2fb78398b3d 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -1229,6 +1229,111 @@ int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
>   	return 0;
>   }
>   
> +struct stage2_split_data {
> +	struct kvm_s2_mmu		*mmu;
> +	void				*memcache;
> +	u64				mc_capacity;
> +};
> +
> +/*
> + * Get the number of page-tables needed to replace a bock with a fully

/s/bock/block

Thanks,

> + * populated tree, up to the PTE level, at particular level.
> + */
> +static inline u32 stage2_block_get_nr_page_tables(u32 level)
> +{
> +	switch (level) {
> +	/* There are no blocks at level 0 */
> +	case 1: return 1 + PTRS_PER_PTE;
> +	case 2: return 1;
> +	case 3: return 0;
> +	default:
> +		WARN_ON_ONCE(1);
> +		return ~0;
> +	}
> +}
> +
> +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> +			       enum kvm_pgtable_walk_flags visit)
> +{
> +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> +	struct stage2_split_data *data = ctx->arg;
> +	kvm_pte_t pte = ctx->old, new, *childp;
> +	enum kvm_pgtable_prot prot;
> +	void *mc = data->memcache;
> +	u32 level = ctx->level;
> +	u64 phys, nr_pages;
> +	bool force_pte;
> +	int ret;
> +
> +	/* No huge-pages exist at the last level */
> +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> +		return 0;
> +
> +	/* We only split valid block mappings */
> +	if (!kvm_pte_valid(pte))
> +		return 0;
> +
> +	nr_pages = stage2_block_get_nr_page_tables(level);
> +	if (data->mc_capacity >= nr_pages) {
> +		/* Build a tree mapped down to the PTE granularity. */
> +		force_pte = true;
> +	} else {
> +		/*
> +		 * Don't force PTEs. This requires a single page of PMDs at the
> +		 * PUD level, or a single page of PTEs at the PMD level. If we
> +		 * are at the PUD level, the PTEs will be created recursively.
> +		 */
> +		force_pte = false;
> +		nr_pages = 1;
> +	}
> +
> +	if (data->mc_capacity < nr_pages)
> +		return -ENOMEM;
> +
> +	phys = kvm_pte_to_phys(pte);
> +	prot = kvm_pgtable_stage2_pte_prot(pte);
> +
> +	ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
> +						 level, prot, mc, force_pte);
> +	if (ret)
> +		return ret;
> +
> +	if (!stage2_try_break_pte(ctx, data->mmu)) {
> +		childp = kvm_pte_follow(new, mm_ops);
> +		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> +		mm_ops->put_page(childp);
> +		return -EAGAIN;
> +	}
> +
> +	/*
> +	 * Note, the contents of the page table are guaranteed to be made
> +	 * visible before the new PTE is assigned because stage2_make_pte()
> +	 * writes the PTE using smp_store_release().
> +	 */
> +	stage2_make_pte(ctx, new);
> +	dsb(ishst);
> +	data->mc_capacity -= nr_pages;
> +	return 0;
> +}
> +
> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity)
> +{
> +	struct stage2_split_data split_data = {
> +		.mmu		= pgt->mmu,
> +		.memcache	= mc,
> +		.mc_capacity	= mc_capacity,
> +	};
> +
> +	struct kvm_pgtable_walker walker = {
> +		.cb	= stage2_split_walker,
> +		.flags	= KVM_PGTABLE_WALK_LEAF,
> +		.arg	= &split_data,
> +	};
> +
> +	return kvm_pgtable_walk(pgt, addr, size, &walker);
> +}
> +
>   int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
>   			      struct kvm_pgtable_mm_ops *mm_ops,
>   			      enum kvm_pgtable_stage2_flags flags,

Ricardo Koller Feb. 16, 2023, 6:14 p.m. UTC | #3
On Thu, Feb 16, 2023 at 10:07 AM Ricardo Koller <ricarkol@google.com> wrote:
>
> On Wed, Feb 15, 2023 at 4:36 PM Oliver Upton <oliver.upton@linux.dev> wrote:
> >
> > On Wed, Feb 15, 2023 at 05:40:38PM +0000, Ricardo Koller wrote:
> >
> > [...]
> >
> > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > index fed314f2b320..e2fb78398b3d 100644
> > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > @@ -1229,6 +1229,111 @@ int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
> > >       return 0;
> > >  }
> > >
> > > +struct stage2_split_data {
> > > +     struct kvm_s2_mmu               *mmu;
> > > +     void                            *memcache;
> > > +     u64                             mc_capacity;
> > > +};
> > > +
> > > +/*
> > > + * Get the number of page-tables needed to replace a bock with a fully
> > > + * populated tree, up to the PTE level, at particular level.
> > > + */
> > > +static inline u32 stage2_block_get_nr_page_tables(u32 level)
> > > +{
> > > +     switch (level) {
> > > +     /* There are no blocks at level 0 */
> > > +     case 1: return 1 + PTRS_PER_PTE;
> > > +     case 2: return 1;
> > > +     case 3: return 0;
> > > +     default:
> > > +             WARN_ON_ONCE(1);
> > > +             return ~0;
> > > +     }
> > > +}
> >
> > This doesn't take into account our varying degrees of hugepage support
> > across page sizes. Perhaps:
> >
> >   static inline int stage2_block_get_nr_page_tables(u32 level)
> >   {
> >           if (WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
> >                            level >= KVM_PGTABLE_MAX_LEVELS))
> >                   return -EINVAL;
> >
> >           switch (level) {
> >           case 1:
> >                 return PTRS_PER_PTE + 1;
> >           case 2:
> >                 return 1;
> >           case 3:
> >                 return 0;
> >           }
> >   }
> >
> > paired with an explicit error check and early return on the caller side.
>
> Sounds good, will add this to the next version.
>
> >
> > > +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > > +                            enum kvm_pgtable_walk_flags visit)
> > > +{
> > > +     struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> > > +     struct stage2_split_data *data = ctx->arg;
> > > +     kvm_pte_t pte = ctx->old, new, *childp;
> > > +     enum kvm_pgtable_prot prot;
> > > +     void *mc = data->memcache;
> > > +     u32 level = ctx->level;
> > > +     u64 phys, nr_pages;
> > > +     bool force_pte;
> > > +     int ret;
> > > +
> > > +     /* No huge-pages exist at the last level */
> > > +     if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> > > +             return 0;
> > > +
> > > +     /* We only split valid block mappings */
> > > +     if (!kvm_pte_valid(pte))
> > > +             return 0;
> > > +
> > > +     nr_pages = stage2_block_get_nr_page_tables(level);
> > > +     if (data->mc_capacity >= nr_pages) {
> > > +             /* Build a tree mapped down to the PTE granularity. */
> > > +             force_pte = true;
> > > +     } else {
> > > +             /*
> > > +              * Don't force PTEs. This requires a single page of PMDs at the
> > > +              * PUD level, or a single page of PTEs at the PMD level. If we
> > > +              * are at the PUD level, the PTEs will be created recursively.
> > > +              */
> > > +             force_pte = false;
> > > +             nr_pages = 1;
> > > +     }
> >
> > Do we know if the 'else' branch here is even desirable? I.e. has
> > recursive shattering been tested with PUD hugepages (HugeTLB 1G) and
> > shown to improve guest performance while dirty tracking?
>
> Yes, I think it's desirable. Here are some numbers on a neoverse n1 using
> dirty_log_perf_test (152 vcpus, 1G each, 4K pages):
>
> CHUNK_SIZE=1G
> Enabling dirty logging time: 2.468014046s
> Iteration 1 dirty memory time: 4.275447900s
>
> CHUNK_SIZE=2M
> Enabling dirty logging time: 2.692124099s
> Iteration 1 dirty memory time: 4.284682220s
>
> The time to enable dirty logging increases as expected when using a smaller
> CHUNK_SIZE, but not by too much (~9%). It's a fair tradeoff for users not
> willing to allocate large caches.
>
> >
> > The observations we've made on existing systems were that the successive
> > break-before-make operations led to a measurable slowdown in guest
> > pre-copy performance. Recursively building the page tables should
> > actually result in *more* break-before-makes than if we just let the vCPU
> > fault path lazily shatter hugepages.
> >
> > --
> > Thanks,
> > Oliver

There is a terribly offensive image that was attached to the previous email.
I would like to apologize for attaching it. I don't know how that happened.

Ricardo
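
For a sense of the cache sizes behind the CHUNK_SIZE tradeoff above, a
back-of-the-envelope from stage2_block_get_nr_page_tables() in this patch
(assuming 4K granules, so PTRS_PER_PTE = 512):

  level 1 (1G block): 1 + 512 = 513 page tables  ->  ~2 MiB of cache per block
  level 2 (2M block): 1 page table               ->   4 KiB of cache per block

That is, fully splitting guest memory in 1G chunks needs roughly 2 MiB of
pre-allocated pages per chunk, while 2M chunks can make do with a much smaller
cache at the cost of the ~9% slower enable time measured above.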

Ricardo Koller Feb. 16, 2023, 6:30 p.m. UTC | #4
On Thu, Feb 16, 2023 at 4:22 AM Shaoqin Huang <shahuang@redhat.com> wrote:
>
> Hi Ricardo,
>
> On 2/16/23 01:40, Ricardo Koller wrote:
> > Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> > range of huge pages. This will be used for eager-splitting huge pages
> > into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> > on write-protection faults, and instead use this function to do it
> > ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> > a time).
> >
> > No functional change intended. This new function will be used in a
> > subsequent commit.
> >
> > Signed-off-by: Ricardo Koller <ricarkol@google.com>
> > ---
> >   arch/arm64/include/asm/kvm_pgtable.h |  30 ++++++++
> >   arch/arm64/kvm/hyp/pgtable.c         | 105 +++++++++++++++++++++++++++
> >   2 files changed, 135 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > index 2ea397ad3e63..b28489aa0994 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -658,6 +658,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
> >    */
> >   int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
> >
> > +/**
> > + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> > + *                           to PAGE_SIZE guest pages.
> > + * @pgt:      Page-table structure initialised by kvm_pgtable_stage2_init().
> > + * @addr:     Intermediate physical address from which to split.
> > + * @size:     Size of the range.
> > + * @mc:               Cache of pre-allocated and zeroed memory from which to allocate
> > + *            page-table pages.
> > + * @mc_capacity: Number of pages in @mc.
> > + *
> > + * @addr and the end (@addr + @size) are effectively aligned down and up to
> > + * the top level huge-page block size. This is an example using 1GB
> > + * huge-pages and 4KB granules.
> > + *
> > + *                          [---input range---]
> > + *                          :                 :
> > + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> > + *                          :                 :
> > + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> > + *                          :                 :
> > + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> > + *                          :                 :
> > + *
> > + * Return: 0 on success, negative error code on failure. Note that
> > + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> > + * blocks in the input range as allowed by @mc_capacity.
> > + */
> > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +                          void *mc, u64 mc_capacity);
> > +
> >   /**
> >    * kvm_pgtable_walk() - Walk a page-table.
> >    * @pgt:    Page-table structure initialised by kvm_pgtable_*_init().
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index fed314f2b320..e2fb78398b3d 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -1229,6 +1229,111 @@ int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
> >       return 0;
> >   }
> >
> > +struct stage2_split_data {
> > +     struct kvm_s2_mmu               *mmu;
> > +     void                            *memcache;
> > +     u64                             mc_capacity;
> > +};
> > +
> > +/*
> > + * Get the number of page-tables needed to replace a bock with a fully
>
> /s/bock/block

ACK. Will fix this in v4.

>
> Thanks,
>
> > + * populated tree, up to the PTE level, at particular level.
> > + */
> > +static inline u32 stage2_block_get_nr_page_tables(u32 level)
> > +{
> > +     switch (level) {
> > +     /* There are no blocks at level 0 */
> > +     case 1: return 1 + PTRS_PER_PTE;
> > +     case 2: return 1;
> > +     case 3: return 0;
> > +     default:
> > +             WARN_ON_ONCE(1);
> > +             return ~0;
> > +     }
> > +}
> > +
> > +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > +                            enum kvm_pgtable_walk_flags visit)
> > +{
> > +     struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> > +     struct stage2_split_data *data = ctx->arg;
> > +     kvm_pte_t pte = ctx->old, new, *childp;
> > +     enum kvm_pgtable_prot prot;
> > +     void *mc = data->memcache;
> > +     u32 level = ctx->level;
> > +     u64 phys, nr_pages;
> > +     bool force_pte;
> > +     int ret;
> > +
> > +     /* No huge-pages exist at the last level */
> > +     if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> > +             return 0;
> > +
> > +     /* We only split valid block mappings */
> > +     if (!kvm_pte_valid(pte))
> > +             return 0;
> > +
> > +     nr_pages = stage2_block_get_nr_page_tables(level);
> > +     if (data->mc_capacity >= nr_pages) {
> > +             /* Build a tree mapped down to the PTE granularity. */
> > +             force_pte = true;
> > +     } else {
> > +             /*
> > +              * Don't force PTEs. This requires a single page of PMDs at the
> > +              * PUD level, or a single page of PTEs at the PMD level. If we
> > +              * are at the PUD level, the PTEs will be created recursively.
> > +              */
> > +             force_pte = false;
> > +             nr_pages = 1;
> > +     }
> > +
> > +     if (data->mc_capacity < nr_pages)
> > +             return -ENOMEM;
> > +
> > +     phys = kvm_pte_to_phys(pte);
> > +     prot = kvm_pgtable_stage2_pte_prot(pte);
> > +
> > +     ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
> > +                                              level, prot, mc, force_pte);
> > +     if (ret)
> > +             return ret;
> > +
> > +     if (!stage2_try_break_pte(ctx, data->mmu)) {
> > +             childp = kvm_pte_follow(new, mm_ops);
> > +             kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> > +             mm_ops->put_page(childp);
> > +             return -EAGAIN;
> > +     }
> > +
> > +     /*
> > +      * Note, the contents of the page table are guaranteed to be made
> > +      * visible before the new PTE is assigned because stage2_make_pte()
> > +      * writes the PTE using smp_store_release().
> > +      */
> > +     stage2_make_pte(ctx, new);
> > +     dsb(ishst);
> > +     data->mc_capacity -= nr_pages;
> > +     return 0;
> > +}
> > +
> > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +                          void *mc, u64 mc_capacity)
> > +{
> > +     struct stage2_split_data split_data = {
> > +             .mmu            = pgt->mmu,
> > +             .memcache       = mc,
> > +             .mc_capacity    = mc_capacity,
> > +     };
> > +
> > +     struct kvm_pgtable_walker walker = {
> > +             .cb     = stage2_split_walker,
> > +             .flags  = KVM_PGTABLE_WALK_LEAF,
> > +             .arg    = &split_data,
> > +     };
> > +
> > +     return kvm_pgtable_walk(pgt, addr, size, &walker);
> > +}
> > +
> >   int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
> >                             struct kvm_pgtable_mm_ops *mm_ops,
> >                             enum kvm_pgtable_stage2_flags flags,
>
> --
> Regards,
> Shaoqin
>

Shaoqin Huang Feb. 17, 2023, 4:07 a.m. UTC | #5
Hi Ricardo,

On 2/16/23 01:40, Ricardo Koller wrote:
> Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> range of huge pages. This will be used for eager-splitting huge pages
> into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> on write-protection faults, and instead use this function to do it
> ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> a time).
>
> No functional change intended. This new function will be used in a
> subsequent commit.
>
> Signed-off-by: Ricardo Koller <ricarkol@google.com>
> ---
>   arch/arm64/include/asm/kvm_pgtable.h |  30 ++++++++
>   arch/arm64/kvm/hyp/pgtable.c         | 105 +++++++++++++++++++++++++++
>   2 files changed, 135 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 2ea397ad3e63..b28489aa0994 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -658,6 +658,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
>    */
>   int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
>   
> +/**
> + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> + *				to PAGE_SIZE guest pages.
> + * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	 Intermediate physical address from which to split.
> + * @size:	 Size of the range.
> + * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
> + *		 page-table pages.
> + * @mc_capacity: Number of pages in @mc.
> + *
> + * @addr and the end (@addr + @size) are effectively aligned down and up to
> + * the top level huge-page block size. This is an example using 1GB
> + * huge-pages and 4KB granules.
> + *
> + *                          [---input range---]
> + *                          :                 :
> + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> + *                          :                 :
> + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> + *                          :                 :
> + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> + *                          :                 :
> + *
> + * Return: 0 on success, negative error code on failure. Note that
> + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> + * blocks in the input range as allowed by @mc_capacity.
> + */
> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity);
> +
>   /**
>    * kvm_pgtable_walk() - Walk a page-table.
>    * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index fed314f2b320..e2fb78398b3d 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -1229,6 +1229,111 @@ int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
>   	return 0;
>   }
>   
> +struct stage2_split_data {
> +	struct kvm_s2_mmu		*mmu;
> +	void				*memcache;
> +	u64				mc_capacity;
> +};
> +
> +/*
> + * Get the number of page-tables needed to replace a bock with a fully
> + * populated tree, up to the PTE level, at particular level.
> + */
> +static inline u32 stage2_block_get_nr_page_tables(u32 level)
> +{
> +	switch (level) {
> +	/* There are no blocks at level 0 */
> +	case 1: return 1 + PTRS_PER_PTE;
> +	case 2: return 1;
> +	case 3: return 0;

Is the level 3 check really needed?
Because level 3 will be first detected by:

+	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+		return 0;

in the stage2_split_walker().

Even if the level 3 case returns 0 and execution continues, it will still trigger:
kvm_pgtable_stage2_create_unlinked(..level..)
   -> __kvm_pgtable_walk(..level+1..)
     -> WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)

Thanks,
Shaoqin

> +	default:
> +		WARN_ON_ONCE(1);
> +		return ~0;
> +	}
> +}
> +
> +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> +			       enum kvm_pgtable_walk_flags visit)
> +{
> +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> +	struct stage2_split_data *data = ctx->arg;
> +	kvm_pte_t pte = ctx->old, new, *childp;
> +	enum kvm_pgtable_prot prot;
> +	void *mc = data->memcache;
> +	u32 level = ctx->level;
> +	u64 phys, nr_pages;
> +	bool force_pte;
> +	int ret;
> +
> +	/* No huge-pages exist at the last level */
> +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> +		return 0;
> +
> +	/* We only split valid block mappings */
> +	if (!kvm_pte_valid(pte))
> +		return 0;
> +
> +	nr_pages = stage2_block_get_nr_page_tables(level);
> +	if (data->mc_capacity >= nr_pages) {
> +		/* Build a tree mapped down to the PTE granularity. */
> +		force_pte = true;
> +	} else {
> +		/*
> +		 * Don't force PTEs. This requires a single page of PMDs at the
> +		 * PUD level, or a single page of PTEs at the PMD level. If we
> +		 * are at the PUD level, the PTEs will be created recursively.
> +		 */
> +		force_pte = false;
> +		nr_pages = 1;
> +	}
> +
> +	if (data->mc_capacity < nr_pages)
> +		return -ENOMEM;
> +
> +	phys = kvm_pte_to_phys(pte);
> +	prot = kvm_pgtable_stage2_pte_prot(pte);
> +
> +	ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
> +						 level, prot, mc, force_pte);
> +	if (ret)
> +		return ret;
> +
> +	if (!stage2_try_break_pte(ctx, data->mmu)) {
> +		childp = kvm_pte_follow(new, mm_ops);
> +		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> +		mm_ops->put_page(childp);
> +		return -EAGAIN;
> +	}
> +
> +	/*
> +	 * Note, the contents of the page table are guaranteed to be made
> +	 * visible before the new PTE is assigned because stage2_make_pte()
> +	 * writes the PTE using smp_store_release().
> +	 */
> +	stage2_make_pte(ctx, new);
> +	dsb(ishst);
> +	data->mc_capacity -= nr_pages;
> +	return 0;
> +}
> +
> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity)
> +{
> +	struct stage2_split_data split_data = {
> +		.mmu		= pgt->mmu,
> +		.memcache	= mc,
> +		.mc_capacity	= mc_capacity,
> +	};
> +
> +	struct kvm_pgtable_walker walker = {
> +		.cb	= stage2_split_walker,
> +		.flags	= KVM_PGTABLE_WALK_LEAF,
> +		.arg	= &split_data,
> +	};
> +
> +	return kvm_pgtable_walk(pgt, addr, size, &walker);
> +}
> +
>   int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
>   			      struct kvm_pgtable_mm_ops *mm_ops,
>   			      enum kvm_pgtable_stage2_flags flags,

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 2ea397ad3e63..b28489aa0994 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -658,6 +658,36 @@  bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
  */
 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
+/**
+ * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
+ *				to PAGE_SIZE guest pages.
+ * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	 Intermediate physical address from which to split.
+ * @size:	 Size of the range.
+ * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
+ *		 page-table pages.
+ * @mc_capacity: Number of pages in @mc.
+ *
+ * @addr and the end (@addr + @size) are effectively aligned down and up to
+ * the top level huge-page block size. This is an example using 1GB
+ * huge-pages and 4KB granules.
+ *
+ *                          [---input range---]
+ *                          :                 :
+ * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
+ *                          :                 :
+ *                   [--2MB--][--2MB--][--2MB--][--2MB--]
+ *                          :                 :
+ *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
+ *                          :                 :
+ *
+ * Return: 0 on success, negative error code on failure. Note that
+ * kvm_pgtable_stage2_split() is best effort: it tries to break as many
+ * blocks in the input range as allowed by @mc_capacity.
+ */
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			     void *mc, u64 mc_capacity);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index fed314f2b320..e2fb78398b3d 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1229,6 +1229,111 @@  int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
 	return 0;
 }
 
+struct stage2_split_data {
+	struct kvm_s2_mmu		*mmu;
+	void				*memcache;
+	u64				mc_capacity;
+};
+
+/*
+ * Get the number of page-tables needed to replace a bock with a fully
+ * populated tree, up to the PTE level, at particular level.
+ */
+static inline u32 stage2_block_get_nr_page_tables(u32 level)
+{
+	switch (level) {
+	/* There are no blocks at level 0 */
+	case 1: return 1 + PTRS_PER_PTE;
+	case 2: return 1;
+	case 3: return 0;
+	default:
+		WARN_ON_ONCE(1);
+		return ~0;
+	}
+}
+
+static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+			       enum kvm_pgtable_walk_flags visit)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+	struct stage2_split_data *data = ctx->arg;
+	kvm_pte_t pte = ctx->old, new, *childp;
+	enum kvm_pgtable_prot prot;
+	void *mc = data->memcache;
+	u32 level = ctx->level;
+	u64 phys, nr_pages;
+	bool force_pte;
+	int ret;
+
+	/* No huge-pages exist at the last level */
+	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+		return 0;
+
+	/* We only split valid block mappings */
+	if (!kvm_pte_valid(pte))
+		return 0;
+
+	nr_pages = stage2_block_get_nr_page_tables(level);
+	if (data->mc_capacity >= nr_pages) {
+		/* Build a tree mapped down to the PTE granularity. */
+		force_pte = true;
+	} else {
+		/*
+		 * Don't force PTEs. This requires a single page of PMDs at the
+		 * PUD level, or a single page of PTEs at the PMD level. If we
+		 * are at the PUD level, the PTEs will be created recursively.
+		 */
+		force_pte = false;
+		nr_pages = 1;
+	}
+
+	if (data->mc_capacity < nr_pages)
+		return -ENOMEM;
+
+	phys = kvm_pte_to_phys(pte);
+	prot = kvm_pgtable_stage2_pte_prot(pte);
+
+	ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
+						 level, prot, mc, force_pte);
+	if (ret)
+		return ret;
+
+	if (!stage2_try_break_pte(ctx, data->mmu)) {
+		childp = kvm_pte_follow(new, mm_ops);
+		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
+		mm_ops->put_page(childp);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Note, the contents of the page table are guaranteed to be made
+	 * visible before the new PTE is assigned because stage2_make_pte()
+	 * writes the PTE using smp_store_release().
+	 */
+	stage2_make_pte(ctx, new);
+	dsb(ishst);
+	data->mc_capacity -= nr_pages;
+	return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			     void *mc, u64 mc_capacity)
+{
+	struct stage2_split_data split_data = {
+		.mmu		= pgt->mmu,
+		.memcache	= mc,
+		.mc_capacity	= mc_capacity,
+	};
+
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_split_walker,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+		.arg	= &split_data,
+	};
+
+	return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
 			      struct kvm_pgtable_mm_ops *mm_ops,
 			      enum kvm_pgtable_stage2_flags flags,