
[v6,04/12] KVM: arm64: Add kvm_pgtable_stage2_split()

Message ID 20230307034555.39733-5-ricarkol@google.com (mailing list archive)
State New, archived
Series Implement Eager Page Splitting for ARM

Commit Message

Ricardo Koller March 7, 2023, 3:45 a.m. UTC
Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
range of huge pages. This will be used for eager-splitting huge pages
into PAGE_SIZE pages. The goal is to avoid having to split huge pages
on write-protection faults, and instead use this function to do it
ahead of time for large ranges (e.g., all guest memory in 1G chunks at
a time).

No functional change intended. This new function will be used in a
subsequent commit.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
---
 arch/arm64/include/asm/kvm_pgtable.h |  30 +++++++
 arch/arm64/kvm/hyp/pgtable.c         | 113 +++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)

Comments

Marc Zyngier March 12, 2023, 11:35 a.m. UTC | #1
On Tue, 07 Mar 2023 03:45:47 +0000,
Ricardo Koller <ricarkol@google.com> wrote:
> 
> Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> range of huge pages. This will be used for eager-splitting huge pages
> into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> on write-protection faults, and instead use this function to do it
> ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> a time).
> 
> No functional change intended. This new function will be used in a
> subsequent commit.

Same comment as before about the usefulness of this last sentence.

> 
> Signed-off-by: Ricardo Koller <ricarkol@google.com>
> Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h |  30 +++++++
>  arch/arm64/kvm/hyp/pgtable.c         | 113 +++++++++++++++++++++++++++
>  2 files changed, 143 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index b7b3fc0fa7a5..40e323a718fc 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -665,6 +665,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
>   */
>  int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
>  
> +/**
> + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> + *				to PAGE_SIZE guest pages.
> + * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	 Intermediate physical address from which to split.
> + * @size:	 Size of the range.
> + * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
> + *		 page-table pages.
> + * @mc_capacity: Number of pages in @mc.
> + *
> + * @addr and the end (@addr + @size) are effectively aligned down and up to
> + * the top level huge-page block size. This is an example using 1GB
> + * huge-pages and 4KB granules.
> + *
> + *                          [---input range---]
> + *                          :                 :
> + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> + *                          :                 :
> + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> + *                          :                 :
> + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> + *                          :                 :

So here, what alignment do we effectively get?

> + *
> + * Return: 0 on success, negative error code on failure. Note that
> + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> + * blocks in the input range as allowed by @mc_capacity.
> + */
> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity);
> +
>  /**
>   * kvm_pgtable_walk() - Walk a page-table.
>   * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 6bdfcb671b32..3149b98d1701 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -1259,6 +1259,119 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
>  	return pgtable;
>  }
>  
> +struct stage2_split_data {
> +	struct kvm_s2_mmu		*mmu;
> +	void				*memcache;
> +	u64				mc_capacity;

Why isn't this a pointer to a *real* memcache structure?

> +};
> +
> +/*
> + * Get the number of page-tables needed to replace a block with a
> + * fully populated tree, up to the PTE level, at a particular level.
> + */
> +static inline int stage2_block_get_nr_page_tables(u32 level)

Please drop the inline. The compiler will figure it out.

> +{
> +	if (WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
> +			 level >= KVM_PGTABLE_MAX_LEVELS))
> +		return -EINVAL;

Move this check to the 'default' case below.

> +
> +	switch (level) {
> +	case 1:
> +		return PTRS_PER_PTE + 1;
> +	case 2:
> +		return 1;

This is odd. Replacing a block by a table always requires
'PTRS_PER_PTE + 1' pages. Why 1? If this is some special treatment for
level-2 mappings, please spell it out.

> +	case 3:
> +		return 0;
> +	default:
> +		return -EINVAL;
> +	};
> +}
> +
> +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> +			       enum kvm_pgtable_walk_flags visit)
> +{
> +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> +	struct stage2_split_data *data = ctx->arg;
> +	kvm_pte_t pte = ctx->old, new, *childp;
> +	enum kvm_pgtable_prot prot;
> +	void *mc = data->memcache;
> +	u32 level = ctx->level;
> +	bool force_pte;
> +	int nr_pages;
> +	u64 phys;
> +
> +	/* No huge-pages exist at the last level */
> +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> +		return 0;

Why the check for level 3 in the previous function if we never get there?

> +
> +	/* We only split valid block mappings */
> +	if (!kvm_pte_valid(pte))
> +		return 0;
> +
> +	nr_pages = stage2_block_get_nr_page_tables(level);
> +	if (nr_pages < 0)
> +		return nr_pages;
> +
> +	if (data->mc_capacity >= nr_pages) {
> +		/* Build a tree mapped down to the PTE granularity. */
> +		force_pte = true;
> +	} else {
> +		/*
> +		 * Don't force PTEs. This requires a single page of PMDs at the
> +		 * PUD level, or a single page of PTEs at the PMD level. If we
> +		 * are at the PUD level, the PTEs will be created recursively.
> +		 */

I don't understand how you reach this 'single page' conclusion. You
need to explain why you get there.

> +		force_pte = false;
> +		nr_pages = 1;
> +	}
> +
> +	if (data->mc_capacity < nr_pages)
> +		return -ENOMEM;
> +
> +	phys = kvm_pte_to_phys(pte);
> +	prot = kvm_pgtable_stage2_pte_prot(pte);
> +
> +	childp = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, phys,
> +						    level, prot, mc, force_pte);
> +	if (IS_ERR(childp))
> +		return PTR_ERR(childp);
> +
> +	if (!stage2_try_break_pte(ctx, data->mmu)) {
> +		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> +		mm_ops->put_page(childp);
> +		return -EAGAIN;
> +	}
> +
> +	/*
> +	 * Note, the contents of the page table are guaranteed to be made
> +	 * visible before the new PTE is assigned because stage2_make_pte()
> +	 * writes the PTE using smp_store_release().
> +	 */
> +	new = kvm_init_table_pte(childp, mm_ops);
> +	stage2_make_pte(ctx, new);
> +	dsb(ishst);
> +	data->mc_capacity -= nr_pages;
> +	return 0;
> +}
> +
> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity)
> +{
> +	struct stage2_split_data split_data = {
> +		.mmu		= pgt->mmu,
> +		.memcache	= mc,
> +		.mc_capacity	= mc_capacity,
> +	};
> +
> +	struct kvm_pgtable_walker walker = {
> +		.cb	= stage2_split_walker,
> +		.flags	= KVM_PGTABLE_WALK_LEAF,
> +		.arg	= &split_data,
> +	};
> +
> +	return kvm_pgtable_walk(pgt, addr, size, &walker);
> +}
> +
>  int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
>  			      struct kvm_pgtable_mm_ops *mm_ops,
>  			      enum kvm_pgtable_stage2_flags flags,

Thanks,

	M.
Ricardo Koller March 13, 2023, 11:58 p.m. UTC | #2
On Sun, Mar 12, 2023 at 11:35:01AM +0000, Marc Zyngier wrote:
> On Tue, 07 Mar 2023 03:45:47 +0000,
> Ricardo Koller <ricarkol@google.com> wrote:
> > 
> > Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> > range of huge pages. This will be used for eager-splitting huge pages
> > into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> > on write-protection faults, and instead use this function to do it
> > ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> > a time).
> > 
> > No functional change intended. This new function will be used in a
> > subsequent commit.
> 
> Same comment as before about the usefulness of this last sentence.
> 
> > 
> > Signed-off-by: Ricardo Koller <ricarkol@google.com>
> > Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h |  30 +++++++
> >  arch/arm64/kvm/hyp/pgtable.c         | 113 +++++++++++++++++++++++++++
> >  2 files changed, 143 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > index b7b3fc0fa7a5..40e323a718fc 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -665,6 +665,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
> >   */
> >  int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
> >  
> > +/**
> > + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> > + *				to PAGE_SIZE guest pages.
> > + * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
> > + * @addr:	 Intermediate physical address from which to split.
> > + * @size:	 Size of the range.
> > + * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
> > + *		 page-table pages.
> > + * @mc_capacity: Number of pages in @mc.
> > + *
> > + * @addr and the end (@addr + @size) are effectively aligned down and up to
> > + * the top level huge-page block size. This is an example using 1GB
> > + * huge-pages and 4KB granules.
> > + *
> > + *                          [---input range---]
> > + *                          :                 :
> > + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> > + *                          :                 :
> > + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> > + *                          :                 :
> > + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> > + *                          :                 :
> 
> So here, what alignment do we effectively get?
>

The function tries to split any block that overlaps with the input
range. Here's another example that might be more helpful:

 *                                [---input range---]
 *                                :                 :
 * [--1G block pte--][--2MB--][--2MB--][--1G block pte--][--1G block pte--]

is split like this:

 * [--1G block pte--][--2MB--][ ][ ][ ][ ][ ][ ][ ][ ][ ][--1G block pte--]
                              <-------split range------->

I think I will just use this new description instead.
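
To make the effective alignment concrete, here is a worked example of my
own (1GB level-1 blocks, 4KB granules; addresses purely illustrative):

  input range           : [0x40100000, 0x80200000)
  overlapping 1G blocks : [0x40000000, 0x80000000) and [0x80000000, 0xC0000000)
  range actually split  : [0x40000000, 0xC0000000)

i.e. the split region is the input range rounded out to the boundaries of
every block it touches, as far as @mc_capacity allows.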

> > + * Return: 0 on success, negative error code on failure. Note that
> > + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> > + * blocks in the input range as allowed by @mc_capacity.
> > + */
> > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +			     void *mc, u64 mc_capacity);
> > +
> >  /**
> >   * kvm_pgtable_walk() - Walk a page-table.
> >   * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 6bdfcb671b32..3149b98d1701 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -1259,6 +1259,119 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
> >  	return pgtable;
> >  }
> >  
> > +struct stage2_split_data {
> > +	struct kvm_s2_mmu		*mmu;
> > +	void				*memcache;
> > +	u64				mc_capacity;
> 
> Why isn't this a pointer to a *real* memcache structure?
> 

Mainly because I wanted this function to be like the other pgtable.c
functions that use opaque pointers to handle the vhe and nvhe cases. vhe
uses "struct kvm_mmu_memory_cache" while nvhe uses "struct hyp_pool".
This series only implements the vhe case but I didn't want to restrict
kvm_pgtable_stage2_split() to vhe just because of this. Just in case, I
have not tried it in nvhe.

> > +};
> > +
> > +/*
> > + * Get the number of page-tables needed to replace a block with a
> > + * fully populated tree, up to the PTE level, at a particular level.
> > + */
> > +static inline int stage2_block_get_nr_page_tables(u32 level)
> 
> Please drop the inline. The compiler will figure it out.
> 
> > +{
> > +	if (WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
> > +			 level >= KVM_PGTABLE_MAX_LEVELS))
> > +		return -EINVAL;
> 
> Move this check to the 'default' case below.
> 
> > +
> > +	switch (level) {
> > +	case 1:
> > +		return PTRS_PER_PTE + 1;
> > +	case 2:
> > +		return 1;
> 
> This is odd. Replacing a block by a table always requires
> 'PTRS_PER_PTE + 1' pages. Why 1? If this is some special treatment for
> level-2 mappings, please spell it out.

I'm not sure I understand. I'm interpreting "level=X" as in "level X
entry".  More specifically, using PAGE_SIZE=4096 as an example:

a level 3 entry (a PTE): a 4096 block needs 0 page-table pages
a level 2 entry: a 2M block needs 1 page-table page
a level 1 entry: a 1G block needs 512+1 page-table pages
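
In case it helps the next reader, here is a small sketch of my own that
turns this counting into the worst-case memcache cost for a range of 1GiB
blocks. It assumes a 4KB granule (PTRS_PER_PTE == 512, from the arm64
page-table headers) and is purely illustrative, not part of the patch:

#include <linux/types.h>

/*
 * Worst-case number of page-table pages needed to fully split
 * @nr_1g_blocks level-1 (1GiB) block entries down to PTEs: each block
 * needs one level-2 table plus PTRS_PER_PTE level-3 tables.
 */
static u64 worst_case_split_pages(u64 nr_1g_blocks)
{
	return nr_1g_blocks * (PTRS_PER_PTE + 1);
}

/* e.g. one 1GiB chunk costs 513 pages, roughly 2MiB of page-table memory. */
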

> 
> > +	case 3:
> > +		return 0;
> > +	default:
> > +		return -EINVAL;
> > +	};
> > +}
> > +
> > +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > +			       enum kvm_pgtable_walk_flags visit)
> > +{
> > +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> > +	struct stage2_split_data *data = ctx->arg;
> > +	kvm_pte_t pte = ctx->old, new, *childp;
> > +	enum kvm_pgtable_prot prot;
> > +	void *mc = data->memcache;
> > +	u32 level = ctx->level;
> > +	bool force_pte;
> > +	int nr_pages;
> > +	u64 phys;
> > +
> > +	/* No huge-pages exist at the last level */
> > +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> > +		return 0;
> 
> Why the check for level 3 in the previous function if we never get there?
> 

I was trying to make stage2_block_get_nr_page_tables() useful for other
cases. It's still correct for other cases to ask how many page-table
pages are needed for a PTE (stage2_block_get_nr_page_tables(3) -> 0).

> > +
> > +	/* We only split valid block mappings */
> > +	if (!kvm_pte_valid(pte))
> > +		return 0;
> > +
> > +	nr_pages = stage2_block_get_nr_page_tables(level);
> > +	if (nr_pages < 0)
> > +		return nr_pages;
> > +
> > +	if (data->mc_capacity >= nr_pages) {
> > +		/* Build a tree mapped down to the PTE granularity. */
> > +		force_pte = true;
> > +	} else {
> > +		/*
> > +		 * Don't force PTEs. This requires a single page of PMDs at the
> > +		 * PUD level, or a single page of PTEs at the PMD level. If we
> > +		 * are at the PUD level, the PTEs will be created recursively.
> > +		 */
> 
> I don't understand how you reach this 'single page' conclusion. You
> need to explain why you get there.

Ack, will expand it.

> 
> > +		force_pte = false;
> > +		nr_pages = 1;
> > +	}
> > +
> > +	if (data->mc_capacity < nr_pages)
> > +		return -ENOMEM;
> > +
> > +	phys = kvm_pte_to_phys(pte);
> > +	prot = kvm_pgtable_stage2_pte_prot(pte);
> > +
> > +	childp = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, phys,
> > +						    level, prot, mc, force_pte);
> > +	if (IS_ERR(childp))
> > +		return PTR_ERR(childp);
> > +
> > +	if (!stage2_try_break_pte(ctx, data->mmu)) {
> > +		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> > +		mm_ops->put_page(childp);
> > +		return -EAGAIN;
> > +	}
> > +
> > +	/*
> > +	 * Note, the contents of the page table are guaranteed to be made
> > +	 * visible before the new PTE is assigned because stage2_make_pte()
> > +	 * writes the PTE using smp_store_release().
> > +	 */
> > +	new = kvm_init_table_pte(childp, mm_ops);
> > +	stage2_make_pte(ctx, new);
> > +	dsb(ishst);
> > +	data->mc_capacity -= nr_pages;
> > +	return 0;
> > +}
> > +
> > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +			     void *mc, u64 mc_capacity)
> > +{
> > +	struct stage2_split_data split_data = {
> > +		.mmu		= pgt->mmu,
> > +		.memcache	= mc,
> > +		.mc_capacity	= mc_capacity,
> > +	};
> > +
> > +	struct kvm_pgtable_walker walker = {
> > +		.cb	= stage2_split_walker,
> > +		.flags	= KVM_PGTABLE_WALK_LEAF,
> > +		.arg	= &split_data,
> > +	};
> > +
> > +	return kvm_pgtable_walk(pgt, addr, size, &walker);
> > +}
> > +
> >  int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
> >  			      struct kvm_pgtable_mm_ops *mm_ops,
> >  			      enum kvm_pgtable_stage2_flags flags,
> 
> Thanks,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.
Marc Zyngier March 15, 2023, 6:09 p.m. UTC | #3
On Mon, 13 Mar 2023 23:58:30 +0000,
Ricardo Koller <ricarkol@google.com> wrote:
> 
> On Sun, Mar 12, 2023 at 11:35:01AM +0000, Marc Zyngier wrote:
> > On Tue, 07 Mar 2023 03:45:47 +0000,
> > Ricardo Koller <ricarkol@google.com> wrote:
> > > 
> > > Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> > > range of huge pages. This will be used for eager-splitting huge pages
> > > into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> > > on write-protection faults, and instead use this function to do it
> > > ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> > > a time).
> > > 
> > > No functional change intended. This new function will be used in a
> > > subsequent commit.
> > 
> > Same comment as before about the usefulness of this last sentence.
> > 
> > > 
> > > Signed-off-by: Ricardo Koller <ricarkol@google.com>
> > > Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
> > > ---
> > >  arch/arm64/include/asm/kvm_pgtable.h |  30 +++++++
> > >  arch/arm64/kvm/hyp/pgtable.c         | 113 +++++++++++++++++++++++++++
> > >  2 files changed, 143 insertions(+)
> > > 
> > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > > index b7b3fc0fa7a5..40e323a718fc 100644
> > > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > > @@ -665,6 +665,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
> > >   */
> > >  int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
> > >  
> > > +/**
> > > + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> > > + *				to PAGE_SIZE guest pages.
> > > + * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
> > > + * @addr:	 Intermediate physical address from which to split.
> > > + * @size:	 Size of the range.
> > > + * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
> > > + *		 page-table pages.
> > > + * @mc_capacity: Number of pages in @mc.
> > > + *
> > > + * @addr and the end (@addr + @size) are effectively aligned down and up to
> > > + * the top level huge-page block size. This is an example using 1GB
> > > + * huge-pages and 4KB granules.
> > > + *
> > > + *                          [---input range---]
> > > + *                          :                 :
> > > + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> > > + *                          :                 :
> > > + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> > > + *                          :                 :
> > > + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> > > + *                          :                 :
> > 
> > So here, what alignment do we effectively get?
> >
> 
> The function tries to split any block that overlaps with the input
> range. Here's another example that might be more helpful:
> 
>  *                                [---input range---]
>  *                                :                 :
>  * [--1G block pte--][--2MB--][--2MB--][--1G block pte--][--1G block pte--]
> 
> is split like this:
> 
>  * [--1G block pte--][--2MB--][ ][ ][ ][ ][ ][ ][ ][ ][ ][--1G block pte--]
>                               <-------split range------->
> 
> I think I will just use this new description instead.
> 
> > > + * Return: 0 on success, negative error code on failure. Note that
> > > + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> > > + * blocks in the input range as allowed by @mc_capacity.
> > > + */
> > > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > +			     void *mc, u64 mc_capacity);
> > > +
> > >  /**
> > >   * kvm_pgtable_walk() - Walk a page-table.
> > >   * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > index 6bdfcb671b32..3149b98d1701 100644
> > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > @@ -1259,6 +1259,119 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
> > >  	return pgtable;
> > >  }
> > >  
> > > +struct stage2_split_data {
> > > +	struct kvm_s2_mmu		*mmu;
> > > +	void				*memcache;
> > > +	u64				mc_capacity;
> > 
> > Why isn't this a pointer to a *real* memcache structure?
> > 
> 
> Mainly because I wanted this function to be like the other pgtable.c
> funtions that use opaque pointers to handle the vhe and nvhe cases. vhe
> uses "struct kvm_mmu_memory_cache" while nvhe uses "struct hyp_pool".
> This series only implements the vhe case but I didn't want to restrict
> kvm_pgtable_stage2_split() to vhe just because of this. Just in case, I
> have not tried it in nvhe.

Do you really mean nVHE here? or do you actually mean pKVM? The former
shouldn't be any different from VHE (the PT code runs in the same
context, give or take), and it is only the latter that is using
something else altogether.

And since pKVM cannot really do page splitting in the normal KVM
sense, this code shouldn't even be there!

> 
> > > +};
> > > +
> > > +/*
> > > + * Get the number of page-tables needed to replace a block with a
> > > + * fully populated tree, up to the PTE level, at a particular level.
> > > + */
> > > +static inline int stage2_block_get_nr_page_tables(u32 level)
> > 
> > Please drop the inline. The compiler will figure it out.
> > 
> > > +{
> > > +	if (WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
> > > +			 level >= KVM_PGTABLE_MAX_LEVELS))
> > > +		return -EINVAL;
> > 
> > Move this check to the 'default' case below.
> > 
> > > +
> > > +	switch (level) {
> > > +	case 1:
> > > +		return PTRS_PER_PTE + 1;
> > > +	case 2:
> > > +		return 1;
> > 
> > This is odd. Replacing a block by a table always requires
> > 'PTRS_PER_PTE + 1' pages. Why 1? If this is some special treatment for
> > level-2 mappings, please spell it out.
> 
> I'm not sure I understand. I'm interpreting "level=X" as in "level X
> entry".  More specifically, using PAGE_SIZE=4096 as an example:
> 
> a level 3 entry (a PTE): a 4096 block needs 0 page-table pages
> a level 2 entry: a 2M block needs 1 page-table page
> a level 1 entry: a 1G block needs 512+1 page-table pages

Ah, gotcha. I was reasoning at the block level, not at the entry
level. Maybe some extra idiot-proof explanation would help here for
the next time I look at this after having paged it out.

> 
> > 
> > > +	case 3:
> > > +		return 0;
> > > +	default:
> > > +		return -EINVAL;
> > > +	};
> > > +}
> > > +
> > > +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > > +			       enum kvm_pgtable_walk_flags visit)
> > > +{
> > > +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> > > +	struct stage2_split_data *data = ctx->arg;
> > > +	kvm_pte_t pte = ctx->old, new, *childp;
> > > +	enum kvm_pgtable_prot prot;
> > > +	void *mc = data->memcache;
> > > +	u32 level = ctx->level;
> > > +	bool force_pte;
> > > +	int nr_pages;
> > > +	u64 phys;
> > > +
> > > +	/* No huge-pages exist at the last level */
> > > +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> > > +		return 0;
> > 
> > Why the check for level 3 in the previous function if we never get there?
> > 
> 
> I was trying to make stage2_block_get_nr_page_tables() useful for other
> cases. It's still correct for other cases to ask how many page-table
> pages are needed for a PTE (stage2_block_get_nr_page_tables(3) -> 0).

Right. I don't mind either way, but the double check somehow tickled
me.

> > > +
> > > +	/* We only split valid block mappings */
> > > +	if (!kvm_pte_valid(pte))
> > > +		return 0;
> > > +
> > > +	nr_pages = stage2_block_get_nr_page_tables(level);
> > > +	if (nr_pages < 0)
> > > +		return nr_pages;
> > > +
> > > +	if (data->mc_capacity >= nr_pages) {
> > > +		/* Build a tree mapped down to the PTE granularity. */
> > > +		force_pte = true;
> > > +	} else {
> > > +		/*
> > > +		 * Don't force PTEs. This requires a single page of PMDs at the
> > > +		 * PUD level, or a single page of PTEs at the PMD level. If we
> > > +		 * are at the PUD level, the PTEs will be created recursively.
> > > +		 */
> > 
> > I don't understand how you reach this 'single page' conclusion. You
> > need to explain why you get there.
> 
> Ack, will expand it.

Thanks. The above explanation you gave helped, so something along these
lines would be good.

Cheers,

	M.
Ricardo Koller March 15, 2023, 6:51 p.m. UTC | #4
On Wed, Mar 15, 2023 at 06:09:12PM +0000, Marc Zyngier wrote:
> On Mon, 13 Mar 2023 23:58:30 +0000,
> Ricardo Koller <ricarkol@google.com> wrote:
> > 
> > On Sun, Mar 12, 2023 at 11:35:01AM +0000, Marc Zyngier wrote:
> > > On Tue, 07 Mar 2023 03:45:47 +0000,
> > > Ricardo Koller <ricarkol@google.com> wrote:
> > > > 
> > > > Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> > > > range of huge pages. This will be used for eager-splitting huge pages
> > > > into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> > > > on write-protection faults, and instead use this function to do it
> > > > ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> > > > a time).
> > > > 
> > > > No functional change intended. This new function will be used in a
> > > > subsequent commit.
> > > 
> > > Same comment as before about the usefulness of this last sentence.
> > > 
> > > > 
> > > > Signed-off-by: Ricardo Koller <ricarkol@google.com>
> > > > Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
> > > > ---
> > > >  arch/arm64/include/asm/kvm_pgtable.h |  30 +++++++
> > > >  arch/arm64/kvm/hyp/pgtable.c         | 113 +++++++++++++++++++++++++++
> > > >  2 files changed, 143 insertions(+)
> > > > 
> > > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > > > index b7b3fc0fa7a5..40e323a718fc 100644
> > > > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > > > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > > > @@ -665,6 +665,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
> > > >   */
> > > >  int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
> > > >  
> > > > +/**
> > > > + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> > > > + *				to PAGE_SIZE guest pages.
> > > > + * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
> > > > + * @addr:	 Intermediate physical address from which to split.
> > > > + * @size:	 Size of the range.
> > > > + * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
> > > > + *		 page-table pages.
> > > > + * @mc_capacity: Number of pages in @mc.
> > > > + *
> > > > + * @addr and the end (@addr + @size) are effectively aligned down and up to
> > > > + * the top level huge-page block size. This is an example using 1GB
> > > > + * huge-pages and 4KB granules.
> > > > + *
> > > > + *                          [---input range---]
> > > > + *                          :                 :
> > > > + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> > > > + *                          :                 :
> > > > + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> > > > + *                          :                 :
> > > > + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> > > > + *                          :                 :
> > > 
> > > So here, what alignment do we effectively get?
> > >
> > 
> > The function tries to split any block that overlaps with the input
> > range. Here's another example that might be more helpful:
> > 
> >  *                                [---input range---]
> >  *                                :                 :
> >  * [--1G block pte--][--2MB--][--2MB--][--1G block pte--][--1G block pte--]
> > 
> > is split like this:
> > 
> >  * [--1G block pte--][--2MB--][ ][ ][ ][ ][ ][ ][ ][ ][ ][--1G block pte--]
> >                               <-------split range------->
> > 
> > I think I will just use this new description instead.
> > 
> > > > + * Return: 0 on success, negative error code on failure. Note that
> > > > + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> > > > + * blocks in the input range as allowed by @mc_capacity.
> > > > + */
> > > > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > > +			     void *mc, u64 mc_capacity);
> > > > +
> > > >  /**
> > > >   * kvm_pgtable_walk() - Walk a page-table.
> > > >   * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> > > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > > index 6bdfcb671b32..3149b98d1701 100644
> > > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > > @@ -1259,6 +1259,119 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
> > > >  	return pgtable;
> > > >  }
> > > >  
> > > > +struct stage2_split_data {
> > > > +	struct kvm_s2_mmu		*mmu;
> > > > +	void				*memcache;
> > > > +	u64				mc_capacity;
> > > 
> > > Why isn't this a pointer to a *real* memcache structure?
> > > 
> > 
> > Mainly because I wanted this function to be like the other pgtable.c
> > functions that use opaque pointers to handle the vhe and nvhe cases. vhe
> > uses "struct kvm_mmu_memory_cache" while nvhe uses "struct hyp_pool".
> > This series only implements the vhe case but I didn't want to restrict
> > kvm_pgtable_stage2_split() to vhe just because of this. Just in case, I
> > have not tried it in nvhe.
> 
> Do you really mean nVHE here? or do you actually mean pKVM? The former
> shouldn't be any different from VHE (the PT code runs in the same
> context, give or take), and it is only the latter that is using
> something else altogether.
> 
> And since pKVM cannot really do page splitting in the normal KVM
> sense, this code shouldn't even be there!
> 

I see, then this should definitely be using "struct
kvm_mmu_memory_cache".  Thanks for the info.

> > 
> > > > +};
> > > > +
> > > > +/*
> > > > + * Get the number of page-tables needed to replace a block with a
> > > > + * fully populated tree, up to the PTE level, at a particular level.
> > > > + */
> > > > +static inline int stage2_block_get_nr_page_tables(u32 level)
> > > 
> > > Please drop the inline. The compiler will figure it out.
> > > 
> > > > +{
> > > > +	if (WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
> > > > +			 level >= KVM_PGTABLE_MAX_LEVELS))
> > > > +		return -EINVAL;
> > > 
> > > Move this check to the 'default' case below.
> > > 
> > > > +
> > > > +	switch (level) {
> > > > +	case 1:
> > > > +		return PTRS_PER_PTE + 1;
> > > > +	case 2:
> > > > +		return 1;
> > > 
> > > This is odd. Replacing a block by a table always requires
> > > 'PTRS_PER_PTE + 1' pages. Why 1? If this is some special treatment for
> > > level-2 mappings, please spell it out.
> > 
> > I'm not sure I understand. I'm interpreting "level=X" as in "level X
> > entry".  More specifically, using PAGE_SIZE=4096 as an example:
> > 
> > a level 3 entry (a PTE): a 4096 block needs 0 page-table pages
> > a level 2 entry: a 2M block needs 1 page-table page
> > a level 1 entry: a 1G block needs 512+1 page-table pages
> 
> Ah, gotcha. I was reasoning at the block level, not at the entry
> level. Maybe some extra idiot-proof explanation would help here for
> the next time I look at this after having paged it out.
> 
> > 
> > > 
> > > > +	case 3:
> > > > +		return 0;
> > > > +	default:
> > > > +		return -EINVAL;
> > > > +	};
> > > > +}
> > > > +
> > > > +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > > > +			       enum kvm_pgtable_walk_flags visit)
> > > > +{
> > > > +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> > > > +	struct stage2_split_data *data = ctx->arg;
> > > > +	kvm_pte_t pte = ctx->old, new, *childp;
> > > > +	enum kvm_pgtable_prot prot;
> > > > +	void *mc = data->memcache;
> > > > +	u32 level = ctx->level;
> > > > +	bool force_pte;
> > > > +	int nr_pages;
> > > > +	u64 phys;
> > > > +
> > > > +	/* No huge-pages exist at the last level */
> > > > +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> > > > +		return 0;
> > > 
> > > Why the check for level 3 in the previous function if we never get there?
> > > 
> > 
> > I was trying to make stage2_block_get_nr_page_tables() useful for other
> > cases. It's still correct for other cases to ask how many page-table
> > pages are needed for a PTE (stage2_block_get_nr_page_tables(3) -> 0).
> 
> Right. I don't mind either way, but the double check somehow tickled
> me.
> 
> > > > +
> > > > +	/* We only split valid block mappings */
> > > > +	if (!kvm_pte_valid(pte))
> > > > +		return 0;
> > > > +
> > > > +	nr_pages = stage2_block_get_nr_page_tables(level);
> > > > +	if (nr_pages < 0)
> > > > +		return nr_pages;
> > > > +
> > > > +	if (data->mc_capacity >= nr_pages) {
> > > > +		/* Build a tree mapped down to the PTE granularity. */
> > > > +		force_pte = true;
> > > > +	} else {
> > > > +		/*
> > > > +		 * Don't force PTEs. This requires a single page of PMDs at the
> > > > +		 * PUD level, or a single page of PTEs at the PMD level. If we
> > > > +		 * are at the PUD level, the PTEs will be created recursively.
> > > > +		 */
> > > 
> > > I don't understand how you reach this 'single page' conclusion. You
> > > need to explain why you get there.
> > 
> > Ack, will expand it.
> 
> Thanks. The above explanation you gave helped, so something along these
> lines would be good.
> 
> Cheers,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index b7b3fc0fa7a5..40e323a718fc 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -665,6 +665,36 @@  bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
  */
 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
+/**
+ * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
+ *				to PAGE_SIZE guest pages.
+ * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	 Intermediate physical address from which to split.
+ * @size:	 Size of the range.
+ * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
+ *		 page-table pages.
+ * @mc_capacity: Number of pages in @mc.
+ *
+ * @addr and the end (@addr + @size) are effectively aligned down and up to
+ * the top level huge-page block size. This is an example using 1GB
+ * huge-pages and 4KB granules.
+ *
+ *                          [---input range---]
+ *                          :                 :
+ * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
+ *                          :                 :
+ *                   [--2MB--][--2MB--][--2MB--][--2MB--]
+ *                          :                 :
+ *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
+ *                          :                 :
+ *
+ * Return: 0 on success, negative error code on failure. Note that
+ * kvm_pgtable_stage2_split() is best effort: it tries to break as many
+ * blocks in the input range as allowed by @mc_capacity.
+ */
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			     void *mc, u64 mc_capacity);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 6bdfcb671b32..3149b98d1701 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1259,6 +1259,119 @@  kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
 	return pgtable;
 }
 
+struct stage2_split_data {
+	struct kvm_s2_mmu		*mmu;
+	void				*memcache;
+	u64				mc_capacity;
+};
+
+/*
+ * Get the number of page-tables needed to replace a block with a
+ * fully populated tree, up to the PTE level, at a particular level.
+ */
+static inline int stage2_block_get_nr_page_tables(u32 level)
+{
+	if (WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
+			 level >= KVM_PGTABLE_MAX_LEVELS))
+		return -EINVAL;
+
+	switch (level) {
+	case 1:
+		return PTRS_PER_PTE + 1;
+	case 2:
+		return 1;
+	case 3:
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+			       enum kvm_pgtable_walk_flags visit)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+	struct stage2_split_data *data = ctx->arg;
+	kvm_pte_t pte = ctx->old, new, *childp;
+	enum kvm_pgtable_prot prot;
+	void *mc = data->memcache;
+	u32 level = ctx->level;
+	bool force_pte;
+	int nr_pages;
+	u64 phys;
+
+	/* No huge-pages exist at the last level */
+	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+		return 0;
+
+	/* We only split valid block mappings */
+	if (!kvm_pte_valid(pte))
+		return 0;
+
+	nr_pages = stage2_block_get_nr_page_tables(level);
+	if (nr_pages < 0)
+		return nr_pages;
+
+	if (data->mc_capacity >= nr_pages) {
+		/* Build a tree mapped down to the PTE granularity. */
+		force_pte = true;
+	} else {
+		/*
+		 * Don't force PTEs. This requires a single page of PMDs at the
+		 * PUD level, or a single page of PTEs at the PMD level. If we
+		 * are at the PUD level, the PTEs will be created recursively.
+		 */
+		force_pte = false;
+		nr_pages = 1;
+	}
+
+	if (data->mc_capacity < nr_pages)
+		return -ENOMEM;
+
+	phys = kvm_pte_to_phys(pte);
+	prot = kvm_pgtable_stage2_pte_prot(pte);
+
+	childp = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, phys,
+						    level, prot, mc, force_pte);
+	if (IS_ERR(childp))
+		return PTR_ERR(childp);
+
+	if (!stage2_try_break_pte(ctx, data->mmu)) {
+		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
+		mm_ops->put_page(childp);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Note, the contents of the page table are guaranteed to be made
+	 * visible before the new PTE is assigned because stage2_make_pte()
+	 * writes the PTE using smp_store_release().
+	 */
+	new = kvm_init_table_pte(childp, mm_ops);
+	stage2_make_pte(ctx, new);
+	dsb(ishst);
+	data->mc_capacity -= nr_pages;
+	return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			     void *mc, u64 mc_capacity)
+{
+	struct stage2_split_data split_data = {
+		.mmu		= pgt->mmu,
+		.memcache	= mc,
+		.mc_capacity	= mc_capacity,
+	};
+
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_split_walker,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+		.arg	= &split_data,
+	};
+
+	return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
 			      struct kvm_pgtable_mm_ops *mm_ops,
 			      enum kvm_pgtable_stage2_flags flags,