
[v2,04/12] KVM: arm64: Add kvm_pgtable_stage2_split()

Message ID 20230206165851.3106338-5-ricarkol@google.com (mailing list archive)
State New, archived
Series Implement Eager Page Splitting for ARM.

Commit Message

Ricardo Koller Feb. 6, 2023, 4:58 p.m. UTC
Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
range of huge pages. This will be used for eager-splitting huge pages
into PAGE_SIZE pages. The goal is to avoid having to split huge pages
on write-protection faults, and instead use this function to do it
ahead of time for large ranges (e.g., all guest memory in 1G chunks at
a time).

No functional change intended. This new function will be used in a
subsequent commit.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
---
 arch/arm64/include/asm/kvm_pgtable.h |  30 ++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 105 +++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)
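
For orientation, here is a minimal kernel-side sketch (not part of this
patch) of how a caller might drive the new API over a range of guest
memory in 1GB chunks, matching the usage described in the commit message.
The helper, the chunk size, and the top-up count are illustrative
assumptions; only kvm_pgtable_stage2_split() itself comes from this series.

/*
 * Illustrative only: split [start, end) in 1GB chunks, topping up a
 * kvm_mmu_memory_cache before each call. 513 pages is enough to split
 * one 1GB chunk down to PTEs with 4KB granules (1 + PTRS_PER_PTE, see
 * stage2_block_get_nr_page_tables() in the patch below).
 */
static int eager_split_range(struct kvm_pgtable *pgt,
			     struct kvm_mmu_memory_cache *cache,
			     u64 start, u64 end)
{
	u64 addr, size;
	int ret;

	for (addr = start; addr < end; addr += SZ_1G) {
		size = min(end - addr, (u64)SZ_1G);

		/* Refill the cache so the whole chunk can be split. */
		ret = kvm_mmu_topup_memory_cache(cache, 513);
		if (ret)
			return ret;

		ret = kvm_pgtable_stage2_split(pgt, addr, size,
					       cache, cache->nobjs);
		if (ret)
			return ret;
	}

	return 0;
}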

Comments

Gavin Shan Feb. 9, 2023, 5:58 a.m. UTC | #1
Hi Ricardo,

On 2/7/23 3:58 AM, Ricardo Koller wrote:
> Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> range of huge pages. This will be used for eager-splitting huge pages
> into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> on write-protection faults, and instead use this function to do it
> ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> a time).
> 
> No functional change intended. This new function will be used in a
> subsequent commit.
> 
> Signed-off-by: Ricardo Koller <ricarkol@google.com>
> ---
>   arch/arm64/include/asm/kvm_pgtable.h |  30 ++++++++
>   arch/arm64/kvm/hyp/pgtable.c         | 105 +++++++++++++++++++++++++++
>   2 files changed, 135 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index e94c92988745..871c4eeb0184 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -658,6 +658,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
>    */
>   int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
>   
> +/**
> + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> + *				to PAGE_SIZE guest pages.
> + * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
> + * @addr:	 Intermediate physical address from which to split.
> + * @size:	 Size of the range.
> + * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
> + *		 page-table pages.
> + * @mc_capacity: Number of pages in @mc.
> + *
> + * @addr and the end (@addr + @size) are effectively aligned down and up to
> + * the top level huge-page block size. This is an example using 1GB
> + * huge-pages and 4KB granules.
> + *
> + *                          [---input range---]
> + *                          :                 :
> + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> + *                          :                 :
> + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> + *                          :                 :
> + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> + *                          :                 :
> + *
> + * Return: 0 on success, negative error code on failure. Note that
> + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> + * blocks in the input range as allowed by @mc_capacity.
> + */
> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity);
> +
>   /**
>    * kvm_pgtable_walk() - Walk a page-table.
>    * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index fed314f2b320..ae80845c8db7 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -1229,6 +1229,111 @@ int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
>   	return 0;
>   }
>   
> +struct stage2_split_data {
> +	struct kvm_s2_mmu		*mmu;
> +	void				*memcache;
> +	u64				mc_capacity;
> +};
> +
> +/*
> + * Get the number of page-tables needed to replace a block with a fully
> + * populated tree, up to the PTE level, at a particular level.
> + */
> +static inline u32 stage2_block_get_nr_page_tables(u32 level)
> +{
> +	switch (level) {
> +	/* There are no blocks at level 0 */
> +	case 1: return 1 + PTRS_PER_PTE;
> +	case 2: return 1;
> +	case 3: return 0;
> +	default:
> +		WARN_ON_ONCE(1);
> +		return ~0;
> +	}
> +}
> +
> +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> +			       enum kvm_pgtable_walk_flags visit)
> +{
> +	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> +	struct stage2_split_data *data = ctx->arg;
> +	kvm_pte_t pte = ctx->old, new, *childp;
> +	enum kvm_pgtable_prot prot;
> +	void *mc = data->memcache;
> +	u32 level = ctx->level;
> +	u64 phys, nr_pages;
> +	bool force_pte;
> +	int ret;
> +
> +	/* No huge-pages exist at the last level */
> +	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> +		return 0;
> +
> +	/* We only split valid block mappings */
> +	if (!kvm_pte_valid(pte) || kvm_pte_table(pte, ctx->level))
> +		return 0;
> +

Since stage2_split_walker() has been specified as a leaf walker by KVM_PGTABLE_WALK_LEAF,
I don't understand how kvm_pte_table() can return true.

> +	nr_pages = stage2_block_get_nr_page_tables(level);
> +	if (data->mc_capacity >= nr_pages) {
> +		/* Build a tree mapped down to the PTE granularity. */
> +		force_pte = true;
> +	} else {
> +		/*
> +		 * Don't force PTEs. This requires a single page of PMDs at the
> +		 * PUD level, or a single page of PTEs at the PMD level. If we
> +		 * are at the PUD level, the PTEs will be created recursively.
> +		 */
> +		force_pte = false;
> +		nr_pages = 1;
> +	}
> +
> +	if (data->mc_capacity < nr_pages)
> +		return -ENOMEM;
> +
> +	phys = kvm_pte_to_phys(pte);
> +	prot = kvm_pgtable_stage2_pte_prot(pte);
> +
> +	ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
> +						 level, prot, mc, force_pte);
> +	if (ret)
> +		return ret;
> +
> +	if (!stage2_try_break_pte(ctx, data->mmu)) {
> +		childp = kvm_pte_follow(new, mm_ops);
> +		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> +		mm_ops->put_page(childp);
> +		return -EAGAIN;
> +	}
> +
> +	/*
> +	 * Note, the contents of the page table are guaranteed to be made
> +	 * visible before the new PTE is assigned because stage2_make_pte()
> +	 * writes the PTE using smp_store_release().
> +	 */
> +	stage2_make_pte(ctx, new);
> +	dsb(ishst);
> +	data->mc_capacity -= nr_pages;
> +	return 0;
> +}
> +

I think 'data->mc_capacity' could be replaced by 'mc->nobjs', because they're
the same thing. With this, we wouldn't need to maintain a duplicate
'data->mc_capacity', since 'data->memcache' already exists.

> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> +			     void *mc, u64 mc_capacity)
> +{
> +	struct stage2_split_data split_data = {
> +		.mmu		= pgt->mmu,
> +		.memcache	= mc,
> +		.mc_capacity	= mc_capacity,
> +	};
> +
> +	struct kvm_pgtable_walker walker = {
> +		.cb	= stage2_split_walker,
> +		.flags	= KVM_PGTABLE_WALK_LEAF,
> +		.arg	= &split_data,
> +	};
> +
> +	return kvm_pgtable_walk(pgt, addr, size, &walker);
> +}
> +
>   int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
>   			      struct kvm_pgtable_mm_ops *mm_ops,
>   			      enum kvm_pgtable_stage2_flags flags,
> 

Thanks,
Gavin
Ricardo Koller Feb. 9, 2023, 12:40 p.m. UTC | #2
On Wed, Feb 8, 2023 at 9:58 PM Gavin Shan <gshan@redhat.com> wrote:
>
> Hi Ricardo,
>
> On 2/7/23 3:58 AM, Ricardo Koller wrote:
> > Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
> > range of huge pages. This will be used for eager-splitting huge pages
> > into PAGE_SIZE pages. The goal is to avoid having to split huge pages
> > on write-protection faults, and instead use this function to do it
> > ahead of time for large ranges (e.g., all guest memory in 1G chunks at
> > a time).
> >
> > No functional change intended. This new function will be used in a
> > subsequent commit.
> >
> > Signed-off-by: Ricardo Koller <ricarkol@google.com>
> > ---
> >   arch/arm64/include/asm/kvm_pgtable.h |  30 ++++++++
> >   arch/arm64/kvm/hyp/pgtable.c         | 105 +++++++++++++++++++++++++++
> >   2 files changed, 135 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> > index e94c92988745..871c4eeb0184 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -658,6 +658,36 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
> >    */
> >   int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
> >
> > +/**
> > + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
> > + *                           to PAGE_SIZE guest pages.
> > + * @pgt:      Page-table structure initialised by kvm_pgtable_stage2_init().
> > + * @addr:     Intermediate physical address from which to split.
> > + * @size:     Size of the range.
> > + * @mc:               Cache of pre-allocated and zeroed memory from which to allocate
> > + *            page-table pages.
> > + * @mc_capacity: Number of pages in @mc.
> > + *
> > + * @addr and the end (@addr + @size) are effectively aligned down and up to
> > + * the top level huge-page block size. This is an example using 1GB
> > + * huge-pages and 4KB granules.
> > + *
> > + *                          [---input range---]
> > + *                          :                 :
> > + * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
> > + *                          :                 :
> > + *                   [--2MB--][--2MB--][--2MB--][--2MB--]
> > + *                          :                 :
> > + *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
> > + *                          :                 :
> > + *
> > + * Return: 0 on success, negative error code on failure. Note that
> > + * kvm_pgtable_stage2_split() is best effort: it tries to break as many
> > + * blocks in the input range as allowed by @mc_capacity.
> > + */
> > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +                          void *mc, u64 mc_capacity);
> > +
> >   /**
> >    * kvm_pgtable_walk() - Walk a page-table.
> >    * @pgt:    Page-table structure initialised by kvm_pgtable_*_init().
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index fed314f2b320..ae80845c8db7 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -1229,6 +1229,111 @@ int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
> >       return 0;
> >   }
> >
> > +struct stage2_split_data {
> > +     struct kvm_s2_mmu               *mmu;
> > +     void                            *memcache;
> > +     u64                             mc_capacity;
> > +};
> > +
> > +/*
> > + * Get the number of page-tables needed to replace a block with a fully
> > + * populated tree, up to the PTE level, at a particular level.
> > + */
> > +static inline u32 stage2_block_get_nr_page_tables(u32 level)
> > +{
> > +     switch (level) {
> > +     /* There are no blocks at level 0 */
> > +     case 1: return 1 + PTRS_PER_PTE;
> > +     case 2: return 1;
> > +     case 3: return 0;
> > +     default:
> > +             WARN_ON_ONCE(1);
> > +             return ~0;
> > +     }
> > +}
> > +
> > +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > +                            enum kvm_pgtable_walk_flags visit)
> > +{
> > +     struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> > +     struct stage2_split_data *data = ctx->arg;
> > +     kvm_pte_t pte = ctx->old, new, *childp;
> > +     enum kvm_pgtable_prot prot;
> > +     void *mc = data->memcache;
> > +     u32 level = ctx->level;
> > +     u64 phys, nr_pages;
> > +     bool force_pte;
> > +     int ret;
> > +
> > +     /* No huge-pages exist at the last level */
> > +     if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> > +             return 0;
> > +
> > +     /* We only split valid block mappings */
> > +     if (!kvm_pte_valid(pte) || kvm_pte_table(pte, ctx->level))
> > +             return 0;
> > +
>
> Since stage2_split_walker() has been specified as a leaf walker by KVM_PGTABLE_WALK_LEAF,
> I don't understand how kvm_pte_table() can return true.

Good point. Will remove it. This check made sense for a previous
version (the RFC),
but not anymore.

>
> > +     nr_pages = stage2_block_get_nr_page_tables(level);
> > +     if (data->mc_capacity >= nr_pages) {
> > +             /* Build a tree mapped down to the PTE granularity. */
> > +             force_pte = true;
> > +     } else {
> > +             /*
> > +              * Don't force PTEs. This requires a single page of PMDs at the
> > +              * PUD level, or a single page of PTEs at the PMD level. If we
> > +              * are at the PUD level, the PTEs will be created recursively.
> > +              */
> > +             force_pte = false;
> > +             nr_pages = 1;
> > +     }
> > +
> > +     if (data->mc_capacity < nr_pages)
> > +             return -ENOMEM;
> > +
> > +     phys = kvm_pte_to_phys(pte);
> > +     prot = kvm_pgtable_stage2_pte_prot(pte);
> > +
> > +     ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
> > +                                              level, prot, mc, force_pte);
> > +     if (ret)
> > +             return ret;
> > +
> > +     if (!stage2_try_break_pte(ctx, data->mmu)) {
> > +             childp = kvm_pte_follow(new, mm_ops);
> > +             kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> > +             mm_ops->put_page(childp);
> > +             return -EAGAIN;
> > +     }
> > +
> > +     /*
> > +      * Note, the contents of the page table are guaranteed to be made
> > +      * visible before the new PTE is assigned because stage2_make_pte()
> > +      * writes the PTE using smp_store_release().
> > +      */
> > +     stage2_make_pte(ctx, new);
> > +     dsb(ishst);
> > +     data->mc_capacity -= nr_pages;
> > +     return 0;
> > +}
> > +
>
> I think 'data->mc_capacity' could be replaced by 'mc->nobjs', because they're
> the same thing. With this, we wouldn't need to maintain a duplicate
> 'data->mc_capacity', since 'data->memcache' already exists.

Ah, nice, yes. That would be simpler.

Thanks!
Ricardo

>
> > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > +                          void *mc, u64 mc_capacity)
> > +{
> > +     struct stage2_split_data split_data = {
> > +             .mmu            = pgt->mmu,
> > +             .memcache       = mc,
> > +             .mc_capacity    = mc_capacity,
> > +     };
> > +
> > +     struct kvm_pgtable_walker walker = {
> > +             .cb     = stage2_split_walker,
> > +             .flags  = KVM_PGTABLE_WALK_LEAF,
> > +             .arg    = &split_data,
> > +     };
> > +
> > +     return kvm_pgtable_walk(pgt, addr, size, &walker);
> > +}
> > +
> >   int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
> >                             struct kvm_pgtable_mm_ops *mm_ops,
> >                             enum kvm_pgtable_stage2_flags flags,
> >
>
> Thanks,
> Gavin
>
Ricardo Koller Feb. 9, 2023, 4:17 p.m. UTC | #3
"(> > > +     if (data->mc_capacity < nr_pages)
> > > +             return -ENOMEM;
> > > +
> > > +     phys = kvm_pte_to_phys(pte);
> > > +     prot = kvm_pgtable_stage2_pte_prot(pte);
> > > +
> > > +     ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
> > > +                                              level, prot, mc, force_pte);
> > > +     if (ret)
> > > +             return ret;
> > > +
> > > +     if (!stage2_try_break_pte(ctx, data->mmu)) {
> > > +             childp = kvm_pte_follow(new, mm_ops);
> > > +             kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> > > +             mm_ops->put_page(childp);
> > > +             return -EAGAIN;
> > > +     }
> > > +
> > > +     /*
> > > +      * Note, the contents of the page table are guaranteed to be made
> > > +      * visible before the new PTE is assigned because stage2_make_pte()
> > > +      * writes the PTE using smp_store_release().
> > > +      */
> > > +     stage2_make_pte(ctx, new);
> > > +     dsb(ishst);
> > > +     data->mc_capacity -= nr_pages;
> > > +     return 0;
> > > +}
> > > +
> >
> > I think 'data->mc_capacity' could be replaced by 'mc->nobjs', because they're
> > the same thing. With this, we wouldn't need to maintain a duplicate
> > 'data->mc_capacity', since 'data->memcache' already exists.
>
> Ah, nice, yes. That would be simpler.
>

Actually, there's a complication. The memcache details are hidden
inside of pgtable.c,
so different types of memcaches (for vhe and nvhe) can be used for allocations.
Specifically, the memcache objects are passed as an opaque pointer ("void *")
and can be used with "struct hyp_pool" and "struct kvm_mmu_memory_cache".

So, here are all the options that I can think of:

        1. stage2_split_walker() is only used in the VHE case with the
        "struct kvm_mmu_memory_cache" memcache, so we could just use it
        instead of a "void *":

                kvm_pgtable_stage2_split(..., struct kvm_mmu_memory_cache *mc);

        However, it could be used for the NVHE case as well, plus
        this would go against the overall design of pgtable.c which tries
        to use opaque objects for most things.

        2. add a "get_nobjs()" method to both memcaches. This is tricky
        because "struct hyp_pool" doesn't directly track its capacity. I
        would rather not mess with it.

        3. This whole accounting of available pages in the memcache is
        needed because of the way I implemented stage2_split_walker() and
        the memcache interface.  stage2_split_walker() tries to allocate
        as many pages for the new table as allowed by the capacity of the
        memcache. The issue with blindly trying until the allocation
        fails is that kvm_mmu_memory_cache_alloc() WARNs and tries to
        allocate using GFP_ATOMIC when !nobjs. We don't want to do that,
        so we could extend kvm_pgtable_mm_ops.zalloc_page() with a
        NO_GFP_ATOMIC_ON_EMPTY (or similar). This flag would have to be
        ignored on the hyp side.

        4. what this patch is currently doing: tracking the capacity by
        hand.

I prefer options 4 and 3. WDYT?
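
To make option 3 concrete, here is a rough sketch of the idea on the VHE
side, for discussion only. The extra zalloc_page() parameter and its
plumbing through kvm_pgtable_mm_ops are hypothetical; only the flag name
comes from the description above, and the flag would simply be ignored by
the hyp allocator.

/*
 * Hypothetical sketch only. With such a flag, the walker could just
 * attempt the allocation and stop cleanly on an empty cache, rather
 * than have kvm_mmu_memory_cache_alloc() WARN and fall back to
 * GFP_ATOMIC, and rather than tracking the capacity by hand (option 4).
 */
#define NO_GFP_ATOMIC_ON_EMPTY	BIT(0)

static void *stage2_memcache_zalloc_page(void *arg, unsigned long flags)
{
	struct kvm_mmu_memory_cache *mc = arg;

	if ((flags & NO_GFP_ATOMIC_ON_EMPTY) && !mc->nobjs)
		return NULL;

	return kvm_mmu_memory_cache_alloc(mc);
}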

Thanks,
Ricardo

> Thanks!
> Ricardo
>
> >
> > > +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > +                          void *mc, u64 mc_capacity)
> > > +{
> > > +     struct stage2_split_data split_data = {
> > > +             .mmu            = pgt->mmu,
> > > +             .memcache       = mc,
> > > +             .mc_capacity    = mc_capacity,
> > > +     };
> > > +
> > > +     struct kvm_pgtable_walker walker = {
> > > +             .cb     = stage2_split_walker,
> > > +             .flags  = KVM_PGTABLE_WALK_LEAF,
> > > +             .arg    = &split_data,
> > > +     };
> > > +
> > > +     return kvm_pgtable_walk(pgt, addr, size, &walker);
> > > +}
> > > +
> > >   int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
> > >                             struct kvm_pgtable_mm_ops *mm_ops,
> > >                             enum kvm_pgtable_stage2_flags flags,
> > >
> >
> > Thanks,
> > Gavin
> >
Gavin Shan Feb. 9, 2023, 10:48 p.m. UTC | #4
On 2/10/23 3:17 AM, Ricardo Koller wrote:
> "(> > > +     if (data->mc_capacity < nr_pages)
>>>> +             return -ENOMEM;
>>>> +
>>>> +     phys = kvm_pte_to_phys(pte);
>>>> +     prot = kvm_pgtable_stage2_pte_prot(pte);
>>>> +
>>>> +     ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
>>>> +                                              level, prot, mc, force_pte);
>>>> +     if (ret)
>>>> +             return ret;
>>>> +
>>>> +     if (!stage2_try_break_pte(ctx, data->mmu)) {
>>>> +             childp = kvm_pte_follow(new, mm_ops);
>>>> +             kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
>>>> +             mm_ops->put_page(childp);
>>>> +             return -EAGAIN;
>>>> +     }
>>>> +
>>>> +     /*
>>>> +      * Note, the contents of the page table are guaranteed to be made
>>>> +      * visible before the new PTE is assigned because stage2_make_pte()
>>>> +      * writes the PTE using smp_store_release().
>>>> +      */
>>>> +     stage2_make_pte(ctx, new);
>>>> +     dsb(ishst);
>>>> +     data->mc_capacity -= nr_pages;
>>>> +     return 0;
>>>> +}
>>>> +
>>>
>>> I think 'data->mc_capacity' could be replaced by 'mc->nobjs', because they're
>>> the same thing. With this, we wouldn't need to maintain a duplicate
>>> 'data->mc_capacity', since 'data->memcache' already exists.
>>
>> Ah, nice, yes. That would be simpler.
>>
> 
> Actually, there's a complication. The memcache details are hidden
> inside of pgtable.c,
> so different types of memcaches (for vhe and nvhe) can be used for allocations.
> Specifically, the memcache objects are passed as an opaque pointer ("void *")
> and can be used with "struct hyp_pool" and "struct kvm_mmu_memory_cache".
> 
> So, here are all the options that I can think of:
> 
>          1. stage2_split_walker() is just used on the VHE case with the
>          "struct kvm_mmu_memory_cache" memcache, so we could just use it
>          instead of a "void *":
> 
>                  kvm_pgtable_stage2_split(..., struct kvm_mmu_memory_cache *mc);
> 
>          However, it could be used for the NVHE case as well, plus
>          this would go against the overall design of pgtable.c which tries
>          to use opaque objects for most things.
> 
>          2. add a "get_nobjs()" method to both memcaches. This is tricky
>          because "struct hyp_pool" doesn't directly track its capacity. I
>          would rather not mess with it.
> 
>          3. This whole accounting of available pages in the memcache is
>          needed because of the way I implemented stage2_split_walker() and
>          the memcache interface.  stage2_split_walker() tries to allocate
>          as many pages for the new table as allowed by the capacity of the
>          memcache. The issue with blindly trying until the allocation
>          fails is that kvm_mmu_memory_cache_alloc() WARNs and tries to
>          allocate using GFP_ATOMIC when !nobjs. We don't want to do that,
>          so we could extend kvm_pgtable_mm_ops.zalloc_page() with a
>          NO_GFP_ATOMIC_ON_EMPTY (or similar). This flag would have to be
>          ignored on the hyp side.
> 
>          4. what this patch is currently doing: tracking the capacity by
>          hand.
> 
> I prefer options 4 and 3. WDYT?
> 

Yeah, stage2_split_walker() is currently only used by VHE, and it calls into
stage2_map_walker() to create the unlinked page table, which is shared by
VHE and nVHE. I think option 3 would be better than 4 because we generally
just want to fetch the pre-allocated page table, instead of allocating a
new page table with GFP_ATOMIC. However, I'm also fine with option 4. I
think this can be improved in the future if you agree.

>>
>>>
>>>> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
>>>> +                          void *mc, u64 mc_capacity)
>>>> +{
>>>> +     struct stage2_split_data split_data = {
>>>> +             .mmu            = pgt->mmu,
>>>> +             .memcache       = mc,
>>>> +             .mc_capacity    = mc_capacity,
>>>> +     };
>>>> +
>>>> +     struct kvm_pgtable_walker walker = {
>>>> +             .cb     = stage2_split_walker,
>>>> +             .flags  = KVM_PGTABLE_WALK_LEAF,
>>>> +             .arg    = &split_data,
>>>> +     };
>>>> +
>>>> +     return kvm_pgtable_walk(pgt, addr, size, &walker);
>>>> +}
>>>> +
>>>>    int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
>>>>                              struct kvm_pgtable_mm_ops *mm_ops,
>>>>                              enum kvm_pgtable_stage2_flags flags,
>>>>
>>>

Thanks,
Gavin
Ricardo Koller Feb. 15, 2023, 5:43 p.m. UTC | #5
On Thu, Feb 9, 2023 at 2:48 PM Gavin Shan <gshan@redhat.com> wrote:
>
> On 2/10/23 3:17 AM, Ricardo Koller wrote:
> > "(> > > +     if (data->mc_capacity < nr_pages)
> >>>> +             return -ENOMEM;
> >>>> +
> >>>> +     phys = kvm_pte_to_phys(pte);
> >>>> +     prot = kvm_pgtable_stage2_pte_prot(pte);
> >>>> +
> >>>> +     ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
> >>>> +                                              level, prot, mc, force_pte);
> >>>> +     if (ret)
> >>>> +             return ret;
> >>>> +
> >>>> +     if (!stage2_try_break_pte(ctx, data->mmu)) {
> >>>> +             childp = kvm_pte_follow(new, mm_ops);
> >>>> +             kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
> >>>> +             mm_ops->put_page(childp);
> >>>> +             return -EAGAIN;
> >>>> +     }
> >>>> +
> >>>> +     /*
> >>>> +      * Note, the contents of the page table are guaranteed to be made
> >>>> +      * visible before the new PTE is assigned because stage2_make_pte()
> >>>> +      * writes the PTE using smp_store_release().
> >>>> +      */
> >>>> +     stage2_make_pte(ctx, new);
> >>>> +     dsb(ishst);
> >>>> +     data->mc_capacity -= nr_pages;
> >>>> +     return 0;
> >>>> +}
> >>>> +
> >>>
> >>> I think 'data->mc_capacity' could be replaced by 'mc->nobjs', because they're
> >>> the same thing. With this, we wouldn't need to maintain a duplicate
> >>> 'data->mc_capacity', since 'data->memcache' already exists.
> >>
> >> Ah, nice, yes. That would be simpler.
> >>
> >
> > Actually, there's a complication. The memcache details are hidden
> > inside of pgtable.c,
> > so different types of memcaches (for vhe and nvhe) can be used for allocations.
> > Specifically, the memcache objects are passed as an opaque pointer ("void *")
> > and can be used with "struct hyp_pool" and "struct kvm_mmu_memory_cache".
> >
> > So, here are all the options that I can think of:
> >
> >          1. stage2_split_walker() is just used on the VHE case with the
> >          "struct kvm_mmu_memory_cache" memcache, so we could just use it
> >          instead of a "void *":
> >
> >                  kvm_pgtable_stage2_split(..., struct kvm_mmu_memory_cache *mc);
> >
> >          However, it could be used for the NVHE case as well, plus
> >          this would go against the overall design of pgtable.c which tries
> >          to use opaque objects for most things.
> >
> >          2. add a "get_nobjs()" method to both memcaches. This is tricky
> >          because "struct hyp_pool" doesn't directly track its capacity. I
> >          would rather not mess with it.
> >
> >          3. This whole accounting of available pages in the memcache is
> >          needed because of the way I implemented stage2_split_walker() and
> >          the memcache interface.  stage2_split_walker() tries to allocate
> >          as many pages for the new table as allowed by the capacity of the
> >          memcache. The issue with blindly trying until the allocation
> >          fails is that kvm_mmu_memory_cache_alloc() WARNs and tries to
> >          allocate using GFP_ATOMIC when !nobjs. We don't want to do that,
> >          so we could extend kvm_pgtable_mm_ops.zalloc_page() with a
> >          NO_GFP_ATOMIC_ON_EMPTY (or similar). This flag would have to be
> >          ignored on the hyp side.
> >
> >          4. what this patch is currently doing: tracking the capacity by
> >          hand.
> >
> > I prefer options 4 and 3. WDYT?
> >
>
> Yeah, stage2_split_walker() is currently only used by VHE and it calls to
> stage2_map_walker() to create the unlinked page table, which is shared by
> VHE and nVHE. I think option 3 would be better than 4 because we generally
> just want to fetch the pre-allocated page table, instead of allocating a
> new page table with GFP_ATOMIC. However, I'm also fine with option 4. I

Going with option 4 then for version 3.

> think this can be improved in the future if you agree.

Definitely!

Thanks,
Ricardo

>
> >>
> >>>
> >>>> +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
> >>>> +                          void *mc, u64 mc_capacity)
> >>>> +{
> >>>> +     struct stage2_split_data split_data = {
> >>>> +             .mmu            = pgt->mmu,
> >>>> +             .memcache       = mc,
> >>>> +             .mc_capacity    = mc_capacity,
> >>>> +     };
> >>>> +
> >>>> +     struct kvm_pgtable_walker walker = {
> >>>> +             .cb     = stage2_split_walker,
> >>>> +             .flags  = KVM_PGTABLE_WALK_LEAF,
> >>>> +             .arg    = &split_data,
> >>>> +     };
> >>>> +
> >>>> +     return kvm_pgtable_walk(pgt, addr, size, &walker);
> >>>> +}
> >>>> +
> >>>>    int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
> >>>>                              struct kvm_pgtable_mm_ops *mm_ops,
> >>>>                              enum kvm_pgtable_stage2_flags flags,
> >>>>
> >>>
>
> Thanks,
> Gavin
>

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index e94c92988745..871c4eeb0184 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -658,6 +658,36 @@  bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
  */
 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
+/**
+ * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
+ *				to PAGE_SIZE guest pages.
+ * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	 Intermediate physical address from which to split.
+ * @size:	 Size of the range.
+ * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
+ *		 page-table pages.
+ * @mc_capacity: Number of pages in @mc.
+ *
+ * @addr and the end (@addr + @size) are effectively aligned down and up to
+ * the top level huge-page block size. This is an example using 1GB
+ * huge-pages and 4KB granules.
+ *
+ *                          [---input range---]
+ *                          :                 :
+ * [--1G block pte--][--1G block pte--][--1G block pte--][--1G block pte--]
+ *                          :                 :
+ *                   [--2MB--][--2MB--][--2MB--][--2MB--]
+ *                          :                 :
+ *                   [ ][ ][:][ ][ ][ ][ ][ ][:][ ][ ][ ]
+ *                          :                 :
+ *
+ * Return: 0 on success, negative error code on failure. Note that
+ * kvm_pgtable_stage2_split() is best effort: it tries to break as many
+ * blocks in the input range as allowed by @mc_capacity.
+ */
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			     void *mc, u64 mc_capacity);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index fed314f2b320..ae80845c8db7 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1229,6 +1229,111 @@  int kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
 	return 0;
 }
 
+struct stage2_split_data {
+	struct kvm_s2_mmu		*mmu;
+	void				*memcache;
+	u64				mc_capacity;
+};
+
+/*
+ * Get the number of page-tables needed to replace a block with a fully
+ * populated tree, up to the PTE level, at a particular level.
+ */
+static inline u32 stage2_block_get_nr_page_tables(u32 level)
+{
+	switch (level) {
+	/* There are no blocks at level 0 */
+	case 1: return 1 + PTRS_PER_PTE;
+	case 2: return 1;
+	case 3: return 0;
+	default:
+		WARN_ON_ONCE(1);
+		return ~0;
+	}
+}
+
+static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+			       enum kvm_pgtable_walk_flags visit)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+	struct stage2_split_data *data = ctx->arg;
+	kvm_pte_t pte = ctx->old, new, *childp;
+	enum kvm_pgtable_prot prot;
+	void *mc = data->memcache;
+	u32 level = ctx->level;
+	u64 phys, nr_pages;
+	bool force_pte;
+	int ret;
+
+	/* No huge-pages exist at the last level */
+	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+		return 0;
+
+	/* We only split valid block mappings */
+	if (!kvm_pte_valid(pte) || kvm_pte_table(pte, ctx->level))
+		return 0;
+
+	nr_pages = stage2_block_get_nr_page_tables(level);
+	if (data->mc_capacity >= nr_pages) {
+		/* Build a tree mapped down to the PTE granularity. */
+		force_pte = true;
+	} else {
+		/*
+		 * Don't force PTEs. This requires a single page of PMDs at the
+		 * PUD level, or a single page of PTEs at the PMD level. If we
+		 * are at the PUD level, the PTEs will be created recursively.
+		 */
+		force_pte = false;
+		nr_pages = 1;
+	}
+
+	if (data->mc_capacity < nr_pages)
+		return -ENOMEM;
+
+	phys = kvm_pte_to_phys(pte);
+	prot = kvm_pgtable_stage2_pte_prot(pte);
+
+	ret = kvm_pgtable_stage2_create_unlinked(data->mmu->pgt, &new, phys,
+						 level, prot, mc, force_pte);
+	if (ret)
+		return ret;
+
+	if (!stage2_try_break_pte(ctx, data->mmu)) {
+		childp = kvm_pte_follow(new, mm_ops);
+		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
+		mm_ops->put_page(childp);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Note, the contents of the page table are guaranteed to be made
+	 * visible before the new PTE is assigned because stage2_make_pte()
+	 * writes the PTE using smp_store_release().
+	 */
+	stage2_make_pte(ctx, new);
+	dsb(ishst);
+	data->mc_capacity -= nr_pages;
+	return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			     void *mc, u64 mc_capacity)
+{
+	struct stage2_split_data split_data = {
+		.mmu		= pgt->mmu,
+		.memcache	= mc,
+		.mc_capacity	= mc_capacity,
+	};
+
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_split_walker,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+		.arg	= &split_data,
+	};
+
+	return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
 			      struct kvm_pgtable_mm_ops *mm_ops,
 			      enum kvm_pgtable_stage2_flags flags,