
[v2,12/19] arm64: mm: Add definitions to support 5 levels of paging

Message ID 20221124123932.2648991-13-ardb@kernel.org (mailing list archive)
State New, archived
Series arm64: Enable LPA2 support for 4k and 16k pages

Commit Message

Ard Biesheuvel Nov. 24, 2022, 12:39 p.m. UTC
Add the required types and descriptor accessors to support 5 levels of
paging in the common code. This is one of the prerequisites for
supporting 52-bit virtual addressing with 4k pages.

Note that this does not cover the code that handles kernel mappings or
the fixmap.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/include/asm/pgalloc.h       | 41 +++++++++++
 arch/arm64/include/asm/pgtable-hwdef.h | 22 +++++-
 arch/arm64/include/asm/pgtable-types.h |  6 ++
 arch/arm64/include/asm/pgtable.h       | 75 +++++++++++++++++++-
 arch/arm64/mm/mmu.c                    | 31 +++++++-
 arch/arm64/mm/pgd.c                    | 15 +++-
 6 files changed, 181 insertions(+), 9 deletions(-)

Comments

Ryan Roberts Nov. 28, 2022, 4:17 p.m. UTC | #1
On 24/11/2022 12:39, Ard Biesheuvel wrote:
> Add the required types and descriptor accessors to support 5 levels of
> paging in the common code. This is one of the prerequisites for
> supporting 52-bit virtual addressing with 4k pages.
> 
> Note that this does not cover the code that handles kernel mappings or
> the fixmap.
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> [...]
>
> diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
> index b91fe4781b06..b364b02e696b 100644
> --- a/arch/arm64/include/asm/pgtable-hwdef.h
> +++ b/arch/arm64/include/asm/pgtable-hwdef.h
> @@ -26,10 +26,10 @@
>  #define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3))
>  
>  /*
> - * Size mapped by an entry at level n ( 0 <= n <= 3)
> + * Size mapped by an entry at level n ( -1 <= n <= 3)
>   * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
>   * in the final page. The maximum number of translation levels supported by
> - * the architecture is 4. Hence, starting at level n, we have further
> + * the architecture is 5. Hence, starting at level n, we have further
>   * ((4 - n) - 1) levels of translation excluding the offset within the page.
>   * So, the total number of bits mapped by an entry at level n is :
>   *

Is it necessary to represent the levels as (-1 to 3) in the kernel, or are you
open to switching to (0 to 4)?

There are a couple of other places where translation level is used, which I
found and fixed up for the KVM LPA2 support work. It got a bit messy to
represent the levels using the architectural range (-1 to 3), so I ended up
representing them as (0 to 4). The main issue was that KVM represents level
as unsigned, so that change would have looked quite big.

Most of this is confined to KVM and the only place it really crosses over
with the kernel is at __tlbi_level(), which makes me think you might be
missing some required changes (I didn't notice these in your other patches):

Looking at the TLB management stuff, I think there are some places you will need
to fix up to correctly handle the extra level in the kernel (e.g.
tlb_get_level(), flush_tlb_range()).

There are some new encodings for level in the FSC field in the ESR. You might
need to update the fault_info array in fault.c to represent these and correctly
handle user space faults for the new level?
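
If I've read the Arm ARM right, the relevant new FSC values are 0b101001
("address size fault, level -1") and 0b101011 ("translation fault, level -1"),
i.e. indices 0x29 and 0x2b, which fault.c currently lists as "unknown 41" and
"unknown 43". So roughly this (untested, please double-check the encodings):

	/* sketch: fault_info[] entries at indices 0x29 and 0x2b */
	{ do_bad,		SIGKILL, SI_KERNEL,	"level -1 address size fault" },
	...
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level -1 translation fault" },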


> [...]

Thanks,
Ryan
Ard Biesheuvel Nov. 28, 2022, 4:22 p.m. UTC | #2
On Mon, 28 Nov 2022 at 17:17, Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 24/11/2022 12:39, Ard Biesheuvel wrote:
> > [...]
>
> Is it necessary to represent the levels as (-1 to 3) in the kernel, or are
> you open to switching to (0 to 4)?
>
> There are a couple of other places where translation level is used, which I
> found and fixed up for the KVM LPA2 support work. It got a bit messy to
> represent the levels using the architectural range (-1 to 3), so I ended up
> representing them as (0 to 4). The main issue was that KVM represents level
> as unsigned, so that change would have looked quite big.
>
> Most of this is confined to KVM and the only place it really crosses over
> with the kernel is at __tlbi_level(), which makes me think you might be
> missing some required changes (I didn't notice these in your other patches):
>
> Looking at the TLB management stuff, I think there are some places you will need
> to fix up to correctly handle the extra level in the kernel (e.g.
> tlb_get_level(), flush_tlb_range()).
>
> There are some new encodings for level in the FSC field in the ESR. You might
> need to update the fault_info array in fault.c to represent these and correctly
> handle user space faults for the new level?
>

Hi Ryan,

Thanks for pointing this out. Once I have educated myself a bit more
about all of this, I should be able to answer your questions :-)

I did not do any user space testing in anger on this series, on the
assumption that we already support 52-bit VAs, but I completely missed
the fact that the additional level of paging requires additional
attention.

As for the level indexing: I have a slight preference for sticking
with the architectural range, but I don't deeply care either way.
Marc Zyngier Nov. 28, 2022, 6 p.m. UTC | #3
On 2022-11-28 16:22, Ard Biesheuvel wrote:
> On Mon, 28 Nov 2022 at 17:17, Ryan Roberts <ryan.roberts@arm.com> wrote:
>> [...]
>
> [...]
>
> As for the level indexing: I have a slight preference for sticking
> with the architectural range, but I don't deeply care either way.

I'd really like to stick to the architectural representation, as
there is an ingrained knowledge of the relation between a base
granule size, a level, and a block mapping size.

The nice thing about level '-1' is that it preserves this behaviour,
and doesn't force everyone to adjust. It also makes it extremely
easy to compare the code and the spec.
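
To make that concrete: with 4k pages, the existing

	ARM64_HW_PGTABLE_LEVEL_SHIFT(n) == (PAGE_SHIFT - 3) * (4 - n) + 3

gives 48/39/30/21/12 bits for n = -1/0/1/2/3, i.e. 256TB, 512GB, 1GB,
2MB and 4kB per entry. Level -1 falls out of the very same formula
everyone already has in their head; no off-by-one to relearn.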

So let's please stick to the [-1;3] range. It will save everyone
a lot of trouble.

Thanks,

         M.
Ryan Roberts Nov. 28, 2022, 6:20 p.m. UTC | #4
On 28/11/2022 18:00, Marc Zyngier wrote:
> On 2022-11-28 16:22, Ard Biesheuvel wrote:
>> On Mon, 28 Nov 2022 at 17:17, Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>
>>> On 24/11/2022 12:39, Ard Biesheuvel wrote:
>>> [...]
>>
>> [...]
>>
>> As for the level indexing: I have a slight preference for sticking
>> with the architectural range, but I don't deeply care either way.
> 
> I'd really like to stick to the architectural representation, as
> there is an ingrained knowledge of the relation between a base
> granule size, a level, and a block mapping size.
> 
> The nice thing about level '-1' is that it preserves this behaviour,
> and doesn't force everyone to adjust. It also makes it extremely
> easy to compare the code and the spec.
> 
> So let's please stick to the [-1;3] range. It will save everyone
> a lot of trouble.

Fair point. It will mean a bigger patch, but I'll rework my stuff to make it all
work with [-1;3] before I post it.
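
Most of that will be mechanical: KVM carries the level around as an
unsigned value today, so it means switching to a signed type and
auditing all the comparisons. Roughly this shape (names invented for
illustration, not from an actual patch):

	#define KVM_PGTABLE_FIRST_LEVEL		(-1)
	#define KVM_PGTABLE_LAST_LEVEL		3

	/* level becomes signed so the architectural -1 is representable */
	static inline bool kvm_level_valid(s8 level)
	{
		return level >= KVM_PGTABLE_FIRST_LEVEL &&
		       level <= KVM_PGTABLE_LAST_LEVEL;
	}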

Ryan Roberts Nov. 29, 2022, 3:46 p.m. UTC | #5
On 28/11/2022 16:22, Ard Biesheuvel wrote:
> On Mon, 28 Nov 2022 at 17:17, Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 24/11/2022 12:39, Ard Biesheuvel wrote:
>>> [...]
>>
>> [...]
> 
> Hi Ryan,
> 
> Thanks for pointing this out. Once I have educated myself a bit more
> about all of this, I should be able to answer your questions :-)

I've just noticed one more thing: get_user_mapping_size() in
arch/arm64/kvm/mmu.c uses CONFIG_PGTABLE_LEVELS to calculate the start level of
a user space page table. I guess that will need some attention now that the
runtime value might be smaller than this macro on systems that don't support LPA2?
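
I'd guess it wants to become something like this (untested; assumes
vabits_actual reflects the runtime VA size, and KVM_PGTABLE_MAX_LEVELS
would need bumping as well):

	u64 ia_bits = vabits_actual;
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
		.ia_bits	= ia_bits,
		.start_level	= (KVM_PGTABLE_MAX_LEVELS -
				   ARM64_HW_PGTABLE_LEVELS(ia_bits)),
		.mm_ops		= &kvm_user_mm_ops,
	};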

Ard Biesheuvel Nov. 29, 2022, 3:48 p.m. UTC | #6
On Tue, 29 Nov 2022 at 16:46, Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 28/11/2022 16:22, Ard Biesheuvel wrote:
> > On Mon, 28 Nov 2022 at 17:17, Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> [...]
> >
> > [...]
>
> I've just noticed one more thing: get_user_mapping_size() in
> arch/arm64/kvm/mmu.c uses CONFIG_PGTABLE_LEVELS to calculate the start level of
> a user space page table. I guess that will need some attention now that the
> runtime value might be smaller than this macro on systems that don't support LPA2?

Indeed. In general, every reference to that quantity should now take
pgtable_l4_enabled() and pgtable_l5_enabled() into account as well.
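
I.e., the kind of helper I have in mind (sketch only, with
pgtable_l4_enabled() being the 16k counterpart introduced elsewhere in
this series):

static inline int pgtable_levels(void)
{
	/* levels actually in use, not the compile-time maximum */
	if (CONFIG_PGTABLE_LEVELS == 5 && !pgtable_l5_enabled())
		return 4;
	if (CONFIG_PGTABLE_LEVELS == 4 && !pgtable_l4_enabled())
		return 3;
	return CONFIG_PGTABLE_LEVELS;
}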

Patch

diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 237224484d0f..cae8c648f462 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -60,6 +60,47 @@  static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
 
+#if CONFIG_PGTABLE_LEVELS > 4
+
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
+{
+	if (pgtable_l5_enabled())
+		set_pgd(pgdp, __pgd(__phys_to_pgd_val(p4dp) | prot));
+}
+
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
+{
+	pgdval_t pgdval = PGD_TYPE_TABLE;
+
+	pgdval |= (mm == &init_mm) ? PGD_TABLE_UXN : PGD_TABLE_PXN;
+	__pgd_populate(pgdp, __pa(p4dp), pgdval);
+}
+
+static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	gfp_t gfp = GFP_PGTABLE_USER;
+
+	if (mm == &init_mm)
+		gfp = GFP_PGTABLE_KERNEL;
+	return (p4d_t *)get_zeroed_page(gfp);
+}
+
+static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
+{
+	if (!pgtable_l5_enabled())
+		return;
+	BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
+	free_page((unsigned long)p4d);
+}
+
+#define __p4d_free_tlb(tlb, p4d, addr)  p4d_free((tlb)->mm, p4d)
+#else
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
+{
+	BUILD_BUG();
+}
+#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
+
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp);
 
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index b91fe4781b06..b364b02e696b 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -26,10 +26,10 @@ 
 #define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3))
 
 /*
- * Size mapped by an entry at level n ( 0 <= n <= 3)
+ * Size mapped by an entry at level n ( -1 <= n <= 3)
  * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
  * in the final page. The maximum number of translation levels supported by
- * the architecture is 4. Hence, starting at level n, we have further
+ * the architecture is 5. Hence, starting at level n, we have further
  * ((4 - n) - 1) levels of translation excluding the offset within the page.
  * So, the total number of bits mapped by an entry at level n is :
  *
@@ -62,9 +62,16 @@ 
 #define PTRS_PER_PUD		(1 << (PAGE_SHIFT - 3))
 #endif
 
+#if CONFIG_PGTABLE_LEVELS > 4
+#define P4D_SHIFT		ARM64_HW_PGTABLE_LEVEL_SHIFT(0)
+#define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK		(~(P4D_SIZE-1))
+#define PTRS_PER_P4D		(1 << (PAGE_SHIFT - 3))
+#endif
+
 /*
  * PGDIR_SHIFT determines the size a top-level page table entry can map
- * (depending on the configuration, this level can be 0, 1 or 2).
+ * (depending on the configuration, this level can be -1, 0, 1 or 2).
  */
 #define PGDIR_SHIFT		ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - CONFIG_PGTABLE_LEVELS)
 #define PGDIR_SIZE		(_AC(1, UL) << PGDIR_SHIFT)
@@ -87,6 +94,15 @@ 
 /*
  * Hardware page table definitions.
  *
+ * Level -1 descriptor (PGD).
+ */
+#define PGD_TYPE_TABLE		(_AT(pgdval_t, 3) << 0)
+#define PGD_TABLE_BIT		(_AT(pgdval_t, 1) << 1)
+#define PGD_TYPE_MASK		(_AT(pgdval_t, 3) << 0)
+#define PGD_TABLE_PXN		(_AT(pgdval_t, 1) << 59)
+#define PGD_TABLE_UXN		(_AT(pgdval_t, 1) << 60)
+
+/*
  * Level 0 descriptor (P4D).
  */
 #define P4D_TYPE_TABLE		(_AT(p4dval_t, 3) << 0)
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index b8f158ae2527..6d6d4065b0cb 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -36,6 +36,12 @@  typedef struct { pudval_t pud; } pud_t;
 #define __pud(x)	((pud_t) { (x) } )
 #endif
 
+#if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;
+#define p4d_val(x)	((x).p4d)
+#define __p4d(x)	((p4d_t) { (x) } )
+#endif
+
 typedef struct { pgdval_t pgd; } pgd_t;
 #define pgd_val(x)	((x).pgd)
 #define __pgd(x)	((pgd_t) { (x) } )
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 666db7173d0f..2f7202d03d98 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -793,7 +793,6 @@  static inline pud_t *p4d_pgtable(p4d_t p4d)
 #else
 
 #define p4d_page_paddr(p4d)	({ BUILD_BUG(); 0;})
-#define pgd_page_paddr(pgd)	({ BUILD_BUG(); 0;})
 
 /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
 #define pud_set_fixmap(addr)		NULL
@@ -804,6 +803,80 @@  static inline pud_t *p4d_pgtable(p4d_t p4d)
 
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
+#if CONFIG_PGTABLE_LEVELS > 4
+
+static __always_inline bool pgtable_l5_enabled(void)
+{
+	if (!alternative_has_feature_likely(ARM64_ALWAYS_BOOT))
+		return vabits_actual == VA_BITS;
+	return alternative_has_feature_unlikely(ARM64_HAS_LVA);
+}
+
+static inline bool mm_p4d_folded(struct mm_struct *mm)
+{
+	return !pgtable_l5_enabled();
+}
+#define mm_p4d_folded  mm_p4d_folded
+
+#define p4d_ERROR(e)	\
+	pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e))
+
+#define pgd_none(pgd)		(pgtable_l5_enabled() && !pgd_val(pgd))
+#define pgd_bad(pgd)		(pgtable_l5_enabled() && !(pgd_val(pgd) & 2))
+#define pgd_present(pgd)	(!pgd_none(pgd))
+
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	if (in_swapper_pgdir(pgdp)) {
+		set_swapper_pgd(pgdp, __pgd(pgd_val(pgd)));
+		return;
+	}
+
+	WRITE_ONCE(*pgdp, pgd);
+	dsb(ishst);
+	isb();
+}
+
+static inline void pgd_clear(pgd_t *pgdp)
+{
+	if (pgtable_l5_enabled())
+		set_pgd(pgdp, __pgd(0));
+}
+
+static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
+{
+	return __pgd_to_phys(pgd);
+}
+
+#define p4d_index(addr)		(((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))
+
+static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr)
+{
+	return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr);
+}
+
+static inline phys_addr_t p4d_offset_phys(pgd_t *pgdp, unsigned long addr)
+{
+	BUG_ON(!pgtable_l5_enabled());
+
+	return pgd_page_paddr(READ_ONCE(*pgdp)) + p4d_index(addr) * sizeof(p4d_t);
+}
+
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long addr)
+{
+	if (!pgtable_l5_enabled())
+		return pgd_to_folded_p4d(pgdp, addr);
+	return (p4d_t *)__va(p4d_offset_phys(pgdp, addr));
+}
+
+#define pgd_page(pgd)		pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd)))
+
+#else
+
+static inline bool pgtable_l5_enabled(void) { return false; }
+
+#endif  /* CONFIG_PGTABLE_LEVELS > 4 */
+
 #define pgd_ERROR(e)	\
 	pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e))
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index bcf617f956cb..d089bc78e592 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1049,7 +1049,7 @@  static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
 	if (CONFIG_PGTABLE_LEVELS <= 3)
 		return;
 
-	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+	if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK))
 		return;
 
 	/*
@@ -1072,8 +1072,8 @@  static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
 				 unsigned long end, unsigned long floor,
 				 unsigned long ceiling)
 {
-	unsigned long next;
 	p4d_t *p4dp, p4d;
+	unsigned long i, next, start = addr;
 
 	do {
 		next = p4d_addr_end(addr, end);
@@ -1085,6 +1085,27 @@  static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
 		WARN_ON(!p4d_present(p4d));
 		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
 	} while (addr = next, addr < end);
+
+	if (!pgtable_l5_enabled())
+		return;
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the p4d page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	p4dp = p4d_offset(pgdp, 0UL);
+	for (i = 0; i < PTRS_PER_P4D; i++) {
+		if (!p4d_none(READ_ONCE(p4dp[i])))
+			return;
+	}
+
+	pgd_clear(pgdp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(p4dp));
 }
 
 static void free_empty_tables(unsigned long addr, unsigned long end,
@@ -1351,6 +1372,12 @@  int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 	return 1;
 }
 
+#ifndef __PAGETABLE_P4D_FOLDED
+void p4d_clear_huge(p4d_t *p4dp)
+{
+}
+#endif
+
 int pud_clear_huge(pud_t *pudp)
 {
 	if (!pud_sect(READ_ONCE(*pudp)))
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 4a64089e5771..3c4f8a279d2b 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -17,11 +17,20 @@ 
 
 static struct kmem_cache *pgd_cache __ro_after_init;
 
+static bool pgdir_is_page_size(void)
+{
+	if (PGD_SIZE == PAGE_SIZE)
+		return true;
+	if (CONFIG_PGTABLE_LEVELS == 5)
+		return !pgtable_l5_enabled();
+	return false;
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	gfp_t gfp = GFP_PGTABLE_USER;
 
-	if (PGD_SIZE == PAGE_SIZE)
+	if (pgdir_is_page_size())
 		return (pgd_t *)__get_free_page(gfp);
 	else
 		return kmem_cache_alloc(pgd_cache, gfp);
@@ -29,7 +38,7 @@  pgd_t *pgd_alloc(struct mm_struct *mm)
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	if (PGD_SIZE == PAGE_SIZE)
+	if (pgdir_is_page_size())
 		free_page((unsigned long)pgd);
 	else
 		kmem_cache_free(pgd_cache, pgd);
@@ -37,7 +46,7 @@  void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 void __init pgtable_cache_init(void)
 {
-	if (PGD_SIZE == PAGE_SIZE)
+	if (pgdir_is_page_size())
 		return;
 
 #ifdef CONFIG_ARM64_PA_BITS_52