
[v2,07/13] powerpc: add support for folded p4d page tables

Message ID 20200216081843.28670-8-rppt@kernel.org
State New, archived
Series mm: remove __ARCH_HAS_5LEVEL_HACK

Commit Message

Mike Rapoport Feb. 16, 2020, 8:18 a.m. UTC
From: Mike Rapoport <rppt@linux.ibm.com>

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
---
 arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 -
 arch/powerpc/include/asm/book3s/64/hash.h     |  4 +-
 arch/powerpc/include/asm/book3s/64/pgalloc.h  |  4 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 58 ++++++++++--------
 arch/powerpc/include/asm/book3s/64/radix.h    |  6 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 -
 arch/powerpc/include/asm/nohash/64/pgalloc.h  |  2 +-
 .../include/asm/nohash/64/pgtable-4k.h        | 32 +++++-----
 arch/powerpc/include/asm/nohash/64/pgtable.h  |  6 +-
 arch/powerpc/include/asm/pgtable.h            |  8 +++
 arch/powerpc/kvm/book3s_64_mmu_radix.c        | 59 ++++++++++++++++---
 arch/powerpc/lib/code-patching.c              |  7 ++-
 arch/powerpc/mm/book3s32/mmu.c                |  2 +-
 arch/powerpc/mm/book3s32/tlb.c                |  4 +-
 arch/powerpc/mm/book3s64/hash_pgtable.c       |  4 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c      | 19 ++++--
 arch/powerpc/mm/book3s64/subpage_prot.c       |  6 +-
 arch/powerpc/mm/hugetlbpage.c                 | 28 +++++----
 arch/powerpc/mm/kasan/kasan_init_32.c         |  8 +--
 arch/powerpc/mm/mem.c                         |  4 +-
 arch/powerpc/mm/nohash/40x.c                  |  4 +-
 arch/powerpc/mm/nohash/book3e_pgtable.c       | 15 +++--
 arch/powerpc/mm/pgtable.c                     | 25 +++++++-
 arch/powerpc/mm/pgtable_32.c                  | 28 +++++----
 arch/powerpc/mm/pgtable_64.c                  | 10 ++--
 arch/powerpc/mm/ptdump/hashpagetable.c        | 20 ++++++-
 arch/powerpc/mm/ptdump/ptdump.c               | 22 ++++++-
 arch/powerpc/xmon/xmon.c                      | 17 +++++-
 28 files changed, 284 insertions(+), 120 deletions(-)

Comments

Christophe Leroy Feb. 16, 2020, 10:41 a.m. UTC | #1
On 16/02/2020 at 09:18, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> Implement primitives necessary for the 4th level folding, add walks of p4d
> level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.

I don't think it is worth adding all these additional walks of p4d;
this patch could be limited to changes like:

-		pud = pud_offset(pgd, gpa);
+		pud = pud_offset(p4d_offset(pgd, gpa), gpa);

The additional walks should be added in another patch the day
powerpc needs them.

See below for more comments.

> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
> ---
>   arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 -
>   arch/powerpc/include/asm/book3s/64/hash.h     |  4 +-
>   arch/powerpc/include/asm/book3s/64/pgalloc.h  |  4 +-
>   arch/powerpc/include/asm/book3s/64/pgtable.h  | 58 ++++++++++--------
>   arch/powerpc/include/asm/book3s/64/radix.h    |  6 +-
>   arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 -
>   arch/powerpc/include/asm/nohash/64/pgalloc.h  |  2 +-
>   .../include/asm/nohash/64/pgtable-4k.h        | 32 +++++-----
>   arch/powerpc/include/asm/nohash/64/pgtable.h  |  6 +-
>   arch/powerpc/include/asm/pgtable.h            |  8 +++
>   arch/powerpc/kvm/book3s_64_mmu_radix.c        | 59 ++++++++++++++++---
>   arch/powerpc/lib/code-patching.c              |  7 ++-
>   arch/powerpc/mm/book3s32/mmu.c                |  2 +-
>   arch/powerpc/mm/book3s32/tlb.c                |  4 +-
>   arch/powerpc/mm/book3s64/hash_pgtable.c       |  4 +-
>   arch/powerpc/mm/book3s64/radix_pgtable.c      | 19 ++++--
>   arch/powerpc/mm/book3s64/subpage_prot.c       |  6 +-
>   arch/powerpc/mm/hugetlbpage.c                 | 28 +++++----
>   arch/powerpc/mm/kasan/kasan_init_32.c         |  8 +--
>   arch/powerpc/mm/mem.c                         |  4 +-
>   arch/powerpc/mm/nohash/40x.c                  |  4 +-
>   arch/powerpc/mm/nohash/book3e_pgtable.c       | 15 +++--
>   arch/powerpc/mm/pgtable.c                     | 25 +++++++-
>   arch/powerpc/mm/pgtable_32.c                  | 28 +++++----
>   arch/powerpc/mm/pgtable_64.c                  | 10 ++--
>   arch/powerpc/mm/ptdump/hashpagetable.c        | 20 ++++++-
>   arch/powerpc/mm/ptdump/ptdump.c               | 22 ++++++-
>   arch/powerpc/xmon/xmon.c                      | 17 +++++-
>   28 files changed, 284 insertions(+), 120 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 5b39c11e884a..39ec11371be0 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -2,7 +2,6 @@
>   #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
>   #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
>   
> -#define __ARCH_USE_5LEVEL_HACK
>   #include <asm-generic/pgtable-nopmd.h>
>   
>   #include <asm/book3s/32/hash.h>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> index 2781ebf6add4..876d1528c2cf 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea)
>   
>   #define	hash__pmd_bad(pmd)		(pmd_val(pmd) & H_PMD_BAD_BITS)
>   #define	hash__pud_bad(pud)		(pud_val(pud) & H_PUD_BAD_BITS)
> -static inline int hash__pgd_bad(pgd_t pgd)
> +static inline int hash__p4d_bad(p4d_t p4d)
>   {
> -	return (pgd_val(pgd) == 0);
> +	return (p4d_val(p4d) == 0);
>   }
>   #ifdef CONFIG_STRICT_KERNEL_RWX
>   extern void hash__mark_rodata_ro(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index a41e91bd0580..69c5b051734f 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
>   	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
>   }
>   
> -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
>   {
> -	*pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
> +	*pgd =  __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
>   }
>   
>   static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 201a69e6a355..ddddbafff0ab 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -2,7 +2,7 @@
>   #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
>   #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
>   
> -#include <asm-generic/5level-fixup.h>
> +#include <asm-generic/pgtable-nop4d.h>
>   
>   #ifndef __ASSEMBLY__
>   #include <linux/mmdebug.h>
> @@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift;
>   /* Bits to mask out from a PUD to get to the PMD page */
>   #define PUD_MASKED_BITS		0xc0000000000000ffUL
>   /* Bits to mask out from a PGD to get to the PUD page */
> -#define PGD_MASKED_BITS		0xc0000000000000ffUL
> +#define P4D_MASKED_BITS		0xc0000000000000ffUL
>   
>   /*
>    * Used as an indicator for rcu callback functions
> @@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
>   	return pte_access_permitted(pud_pte(pud), write);
>   }
>   
> -#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
> +#define __p4d_raw(x)	((p4d_t) { __pgd_raw(x) })
> +static inline __be64 p4d_raw(p4d_t x)
> +{
> +	return pgd_raw(x.pgd);
> +}
> +

Shouldn't this be defined in asm/pgtable-be-types.h, just like the other
__pxx_raw() helpers?

> +#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
>   
> -static inline void pgd_clear(pgd_t *pgdp)
> +static inline void p4d_clear(p4d_t *p4dp)
>   {
> -	*pgdp = __pgd(0);
> +	*p4dp = __p4d(0);
>   }
>   
> -static inline int pgd_none(pgd_t pgd)
> +static inline int p4d_none(p4d_t p4d)
>   {
> -	return !pgd_raw(pgd);
> +	return !p4d_raw(p4d);
>   }
>   
> -static inline int pgd_present(pgd_t pgd)
> +static inline int p4d_present(p4d_t p4d)
>   {
> -	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
> +	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT));
>   }
>   
> -static inline pte_t pgd_pte(pgd_t pgd)
> +static inline pte_t p4d_pte(p4d_t p4d)
>   {
> -	return __pte_raw(pgd_raw(pgd));
> +	return __pte_raw(p4d_raw(p4d));
>   }
>   
> -static inline pgd_t pte_pgd(pte_t pte)
> +static inline p4d_t pte_p4d(pte_t pte)
>   {
> -	return __pgd_raw(pte_raw(pte));
> +	return __p4d_raw(pte_raw(pte));
>   }
>   
> -static inline int pgd_bad(pgd_t pgd)
> +static inline int p4d_bad(p4d_t p4d)
>   {
>   	if (radix_enabled())
> -		return radix__pgd_bad(pgd);
> -	return hash__pgd_bad(pgd);
> +		return radix__p4d_bad(p4d);
> +	return hash__p4d_bad(p4d);
>   }
>   
> -#define pgd_access_permitted pgd_access_permitted
> -static inline bool pgd_access_permitted(pgd_t pgd, bool write)
> +#define p4d_access_permitted p4d_access_permitted
> +static inline bool p4d_access_permitted(p4d_t p4d, bool write)
>   {
> -	return pte_access_permitted(pgd_pte(pgd), write);
> +	return pte_access_permitted(p4d_pte(p4d), write);
>   }
>   
> -extern struct page *pgd_page(pgd_t pgd);
> +extern struct page *p4d_page(p4d_t p4d);
>   
>   /* Pointers in the page table tree are physical addresses */
>   #define __pgtable_ptr_val(ptr)	__pa(ptr)
>   
>   #define pmd_page_vaddr(pmd)	__va(pmd_val(pmd) & ~PMD_MASKED_BITS)
>   #define pud_page_vaddr(pud)	__va(pud_val(pud) & ~PUD_MASKED_BITS)
> -#define pgd_page_vaddr(pgd)	__va(pgd_val(pgd) & ~PGD_MASKED_BITS)
> +#define p4d_page_vaddr(p4d)	__va(p4d_val(p4d) & ~P4D_MASKED_BITS)
>   
>   #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
>   #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
> @@ -1010,8 +1016,8 @@ extern struct page *pgd_page(pgd_t pgd);
>   
>   #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
>   
> -#define pud_offset(pgdp, addr)	\
> -	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
> +#define pud_offset(p4dp, addr)	\
> +	(((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr))
>   #define pmd_offset(pudp,addr) \
>   	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
>   #define pte_offset_kernel(dir,addr) \
> @@ -1368,6 +1374,12 @@ static inline bool pud_is_leaf(pud_t pud)
>   	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
>   }
>   
> +#define p4d_is_leaf p4d_is_leaf
> +static inline bool p4d_is_leaf(p4d_t p4d)
> +{
> +	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE));
> +}
> +
>   #define pgd_is_leaf pgd_is_leaf
>   #define pgd_leaf pgd_is_leaf
>   static inline bool pgd_is_leaf(pgd_t pgd)

[...]

> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> index 8cc543ed114c..0a05fddd7881 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -139,6 +139,14 @@ static inline bool pud_is_leaf(pud_t pud)
>   }
>   #endif
>   
> +#ifndef p4d_is_leaf
> +#define p4d_is_leaf p4d_is_leaf
> +static inline bool p4d_is_leaf(p4d_t p4d)
> +{
> +	return false;
> +}
> +#endif
> +
>   #ifndef pgd_is_leaf
>   #define pgd_is_leaf pgd_is_leaf
>   static inline bool pgd_is_leaf(pgd_t pgd)
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 803940d79b73..5aacfa0b27ef 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -494,17 +494,39 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
>   	pud_free(kvm->mm, pud);
>   }
>   
> +static void kvmppc_unmap_free_p4d(struct kvm *kvm, p4d_t *p4d,
> +				  unsigned int lpid)
> +{
> +	unsigned long iu;
> +	p4d_t *p = p4d;
> +
> +	for (iu = 0; iu < PTRS_PER_P4D; ++iu, ++p) {
> +		if (!p4d_present(*p))
> +			continue;
> +		if (p4d_is_leaf(*p)) {
> +			p4d_clear(p);
> +		} else {
> +			pud_t *pud;
> +
> +			pud = pud_offset(p, 0);
> +			kvmppc_unmap_free_pud(kvm, pud, lpid);
> +			p4d_clear(p);
> +		}
> +	}
> +	p4d_free(kvm->mm, p4d);
> +}
> +
>   void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
>   {
>   	unsigned long ig;
>   
>   	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
> -		pud_t *pud;
> +		p4d_t *p4d;
>   
>   		if (!pgd_present(*pgd))
>   			continue;
> -		pud = pud_offset(pgd, 0);
> -		kvmppc_unmap_free_pud(kvm, pud, lpid);
> +		p4d = p4d_offset(pgd, 0);
> +		kvmppc_unmap_free_p4d(kvm, p4d, lpid);
>   		pgd_clear(pgd);
>   	}
>   }
> @@ -566,6 +588,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>   		      unsigned long *rmapp, struct rmap_nested **n_rmap)
>   {
>   	pgd_t *pgd;
> +	p4d_t *p4d, *new_p4d = NULL;
>   	pud_t *pud, *new_pud = NULL;
>   	pmd_t *pmd, *new_pmd = NULL;
>   	pte_t *ptep, *new_ptep = NULL;
> @@ -573,9 +596,15 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>   
>   	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
>   	pgd = pgtable + pgd_index(gpa);
> -	pud = NULL;
> +	p4d = NULL;
>   	if (pgd_present(*pgd))
> -		pud = pud_offset(pgd, gpa);
> +		p4d = p4d_offset(pgd, gpa);
> +	else
> +		new_p4d = p4d_alloc_one(kvm->mm, gpa);
> +
> +	pud = NULL;
> +	if (p4d_present(*p4d))
> +		pud = pud_offset(p4d, gpa);

Is it worth adding all this new code?

My understanding is that the series' objective is to get rid of
__ARCH_HAS_5LEVEL_HACK, not to add support for 5 levels on an
architecture that does not need it (at least for now).
If we want to add support for 5 levels, it can be done later in another
patch.

Here I think your change could be limited to:

-		pud = pud_offset(pgd, gpa);
+		pud = pud_offset(p4d_offset(pgd, gpa), gpa);


>   	else
>   		new_pud = pud_alloc_one(kvm->mm, gpa);
>   
> @@ -597,12 +626,18 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>   	/* Now traverse again under the lock and change the tree */
>   	ret = -ENOMEM;
>   	if (pgd_none(*pgd)) {
> +		if (!new_p4d)
> +			goto out_unlock;
> +		pgd_populate(kvm->mm, pgd, new_p4d);
> +		new_p4d = NULL;
> +	}
> +	if (p4d_none(*p4d)) {
>   		if (!new_pud)
>   			goto out_unlock;
> -		pgd_populate(kvm->mm, pgd, new_pud);
> +		p4d_populate(kvm->mm, p4d, new_pud);
>   		new_pud = NULL;
>   	}
> -	pud = pud_offset(pgd, gpa);
> +	pud = pud_offset(p4d, gpa);
>   	if (pud_is_leaf(*pud)) {
>   		unsigned long hgpa = gpa & PUD_MASK;
>   
> @@ -1220,6 +1255,7 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
>   	pgd_t *pgt;
>   	struct kvm_nested_guest *nested;
>   	pgd_t pgd, *pgdp;
> +	p4d_t p4d, *p4dp;
>   	pud_t pud, *pudp;
>   	pmd_t pmd, *pmdp;
>   	pte_t *ptep;
> @@ -1298,7 +1334,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
>   			continue;
>   		}
>   
> -		pudp = pud_offset(&pgd, gpa);
> +		p4dp = p4d_offset(&pgd, gpa);
> +		p4d = READ_ONCE(*p4dp);
> +		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
> +			gpa = (gpa & P4D_MASK) + P4D_SIZE;
> +			continue;
> +		}
> +
> +		pudp = pud_offset(&p4d, gpa);

Same here: you are forcing a useless read with READ_ONCE().

Your change could be limited to

-		pudp = pud_offset(&pgd, gpa);
+		pudp = pud_offset(p4d_offset(&pgd, gpa), gpa);

This comment applies to many other places.


>   		pud = READ_ONCE(*pudp);
>   		if (!(pud_val(pud) & _PAGE_PRESENT)) {
>   			gpa = (gpa & PUD_MASK) + PUD_SIZE;
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index 3345f039a876..7a59f6863cec 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr)
>   	pte_t *ptep;
>   	pmd_t *pmdp;
>   	pud_t *pudp;
> +	p4d_t *p4dp;
>   	pgd_t *pgdp;
>   
>   	pgdp = pgd_offset_k(addr);
>   	if (unlikely(!pgdp))
>   		return -EINVAL;
>   
> -	pudp = pud_offset(pgdp, addr);
> +	p4dp = p4d_offset(pgdp, addr);
> +	if (unlikely(!p4dp))
> +		return -EINVAL;
> +
> +	pudp = pud_offset(p4dp, addr);
>   	if (unlikely(!pudp))
>   		return -EINVAL;
>   
> diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> index 0a1c65a2c565..b2fc3e71165c 100644
> --- a/arch/powerpc/mm/book3s32/mmu.c
> +++ b/arch/powerpc/mm/book3s32/mmu.c
> @@ -312,7 +312,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea)
>   
>   	if (!Hash)
>   		return;
> -	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
> +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, ea), ea), ea), ea);

If we continue like this, in ten years this line is going to be many
kilometers long.

I think the above would be worth a generic helper.

>   	if (!pmd_none(*pmd))
>   		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
>   }
> diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
> index 2fcd321040ff..175bc33b41b7 100644
> --- a/arch/powerpc/mm/book3s32/tlb.c
> +++ b/arch/powerpc/mm/book3s32/tlb.c
> @@ -87,7 +87,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
>   	if (start >= end)
>   		return;
>   	end = (end - 1) | ~PAGE_MASK;
> -	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
> +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, start), start), start), start);
>   	for (;;) {
>   		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
>   		if (pmd_end > end)
> @@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
>   		return;
>   	}
>   	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
> -	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
> +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr), vmaddr);
>   	if (!pmd_none(*pmd))
>   		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
>   }
> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> index 64733b9cb20a..9cd15937e88a 100644
> --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> @@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
>   int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
>   {
>   	pgd_t *pgdp;
> +	p4d_t *p4dp;
>   	pud_t *pudp;
>   	pmd_t *pmdp;
>   	pte_t *ptep;
> @@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
>   	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
>   	if (slab_is_available()) {
>   		pgdp = pgd_offset_k(ea);
> -		pudp = pud_alloc(&init_mm, pgdp, ea);
> +		p4dp = p4d_offset(pgdp, ea);
> +		pudp = pud_alloc(&init_mm, p4dp, ea);

Could be a single line, without a new var.

-		pudp = pud_alloc(&init_mm, pgdp, ea);
+		pudp = pud_alloc(&init_mm, p4d_offset(pgdp, ea), ea);


The same kind of comments as above apply to the rest.

Christophe
Mike Rapoport Feb. 18, 2020, 10:54 a.m. UTC | #2
On Sun, Feb 16, 2020 at 11:41:07AM +0100, Christophe Leroy wrote:
> 
> 
> On 16/02/2020 at 09:18, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > Implement primitives necessary for the 4th level folding, add walks of p4d
> > level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
> 
> I don't think it is worth adding all these additional walks of p4d; this
> patch could be limited to changes like:
> 
> -		pud = pud_offset(pgd, gpa);
> +		pud = pud_offset(p4d_offset(pgd, gpa), gpa);
> 
> The additional walks should be added in another patch the day powerpc
> needs them.

Ok, I'll update the patch to reduce walking the p4d.
 
> See below for more comments.
> 
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
> > ---

...

> > diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> > index 201a69e6a355..ddddbafff0ab 100644
> > --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> > +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> > @@ -2,7 +2,7 @@
> >   #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
> >   #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
> > -#include <asm-generic/5level-fixup.h>
> > +#include <asm-generic/pgtable-nop4d.h>
> >   #ifndef __ASSEMBLY__
> >   #include <linux/mmdebug.h>
> > @@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift;
> >   /* Bits to mask out from a PUD to get to the PMD page */
> >   #define PUD_MASKED_BITS		0xc0000000000000ffUL
> >   /* Bits to mask out from a PGD to get to the PUD page */
> > -#define PGD_MASKED_BITS		0xc0000000000000ffUL
> > +#define P4D_MASKED_BITS		0xc0000000000000ffUL
> >   /*
> >    * Used as an indicator for rcu callback functions
> > @@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
> >   	return pte_access_permitted(pud_pte(pud), write);
> >   }
> > -#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
> > +#define __p4d_raw(x)	((p4d_t) { __pgd_raw(x) })
> > +static inline __be64 p4d_raw(p4d_t x)
> > +{
> > +	return pgd_raw(x.pgd);
> > +}
> > +
> 
> Shouldn't this be defined in asm/pgtable-be-types.h, just like the other
> __pxx_raw() helpers?

Ideally yes, but this creates weird header file dependencies and untangling
them would generate way too much churn.
 
> > +#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
> > -static inline void pgd_clear(pgd_t *pgdp)
> > +static inline void p4d_clear(p4d_t *p4dp)
> >   {
> > -	*pgdp = __pgd(0);
> > +	*p4dp = __p4d(0);
> >   }

...

> > @@ -573,9 +596,15 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> >   	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
> >   	pgd = pgtable + pgd_index(gpa);
> > -	pud = NULL;
> > +	p4d = NULL;
> >   	if (pgd_present(*pgd))
> > -		pud = pud_offset(pgd, gpa);
> > +		p4d = p4d_offset(pgd, gpa);
> > +	else
> > +		new_p4d = p4d_alloc_one(kvm->mm, gpa);
> > +
> > +	pud = NULL;
> > +	if (p4d_present(*p4d))
> > +		pud = pud_offset(p4d, gpa);
> 
> Is it worth adding all this new code?
> 
> My understanding is that the series' objective is to get rid of
> __ARCH_HAS_5LEVEL_HACK, not to add support for 5 levels on an
> architecture that does not need it (at least for now).
> If we want to add support for 5 levels, it can be done later in another
> patch.
> 
> Here I think your change could be limited to:
> 
> -		pud = pud_offset(pgd, gpa);
> +		pud = pud_offset(p4d_offset(pgd, gpa), gpa);

This won't work. Without __ARCH_USE_5LEVEL_HACK defined, pgd_present() is
hardwired to 1 and the actual check for the top level is performed with
p4d_present(). The 'else' clause that allocates the p4d will never be taken;
it could be removed, but I prefer to keep it for consistency.
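
For reference, this is roughly what <asm-generic/pgtable-nop4d.h> gives us
once the fixup header is gone (a condensed sketch, not a verbatim copy of
the header):

	/* with p4d folded into pgd, the pgd-level checks become no-ops */
	static inline int pgd_none(pgd_t pgd)		{ return 0; }
	static inline int pgd_bad(pgd_t pgd)		{ return 0; }
	static inline int pgd_present(pgd_t pgd)	{ return 1; }
	static inline void pgd_clear(pgd_t *pgd)	{ }

	/* the p4d entry is just the pgd entry seen through another type */
	static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
	{
		return (p4d_t *)pgd;
	}

With PTRS_PER_P4D equal to 1 the extra level costs nothing at run time, but
the real presence check can only happen with p4d_present()/p4d_none().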
 
> >   	else
> >   		new_pud = pud_alloc_one(kvm->mm, gpa);
> > @@ -597,12 +626,18 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> >   	/* Now traverse again under the lock and change the tree */
> >   	ret = -ENOMEM;
> >   	if (pgd_none(*pgd)) {
> > +		if (!new_p4d)
> > +			goto out_unlock;
> > +		pgd_populate(kvm->mm, pgd, new_p4d);
> > +		new_p4d = NULL;
> > +	}
> > +	if (p4d_none(*p4d)) {
> >   		if (!new_pud)
> >   			goto out_unlock;
> > -		pgd_populate(kvm->mm, pgd, new_pud);
> > +		p4d_populate(kvm->mm, p4d, new_pud);
> >   		new_pud = NULL;
> >   	}
> > -	pud = pud_offset(pgd, gpa);
> > +	pud = pud_offset(p4d, gpa);
> >   	if (pud_is_leaf(*pud)) {
> >   		unsigned long hgpa = gpa & PUD_MASK;
> > @@ -1220,6 +1255,7 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
> >   	pgd_t *pgt;
> >   	struct kvm_nested_guest *nested;
> >   	pgd_t pgd, *pgdp;
> > +	p4d_t p4d, *p4dp;
> >   	pud_t pud, *pudp;
> >   	pmd_t pmd, *pmdp;
> >   	pte_t *ptep;
> > @@ -1298,7 +1334,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
> >   			continue;
> >   		}
> > -		pudp = pud_offset(&pgd, gpa);
> > +		p4dp = p4d_offset(&pgd, gpa);
> > +		p4d = READ_ONCE(*p4dp);
> > +		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
> > +			gpa = (gpa & P4D_MASK) + P4D_SIZE;
> > +			continue;
> > +		}
> > +
> > +		pudp = pud_offset(&p4d, gpa);
> 
> Same here: you are forcing a useless read with READ_ONCE().
> 
> Your change could be limited to
> 
> -		pudp = pud_offset(&pgd, gpa);
> +		pudp = pud_offset(p4d_offset(&pgd, gpa), gpa);

Here again, the actual check must be done against the p4d rather than the
pgd. We could skip the READ_ONCE() of the pgd, but since this is a debugfs
method I don't think the micro-optimization is more important than code
consistency.
 
> This comment applies to many other places.

I'll make another pass to see where we can take the shortcut and use 

	pudp = pud_offset(p4d_offset(...))
 
> >   		pud = READ_ONCE(*pudp);
> >   		if (!(pud_val(pud) & _PAGE_PRESENT)) {
> >   			gpa = (gpa & PUD_MASK) + PUD_SIZE;
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 3345f039a876..7a59f6863cec 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr)
> >   	pte_t *ptep;
> >   	pmd_t *pmdp;
> >   	pud_t *pudp;
> > +	p4d_t *p4dp;
> >   	pgd_t *pgdp;
> >   	pgdp = pgd_offset_k(addr);
> >   	if (unlikely(!pgdp))
> >   		return -EINVAL;
> > -	pudp = pud_offset(pgdp, addr);
> > +	p4dp = p4d_offset(pgdp, addr);
> > +	if (unlikely(!p4dp))
> > +		return -EINVAL;
> > +
> > +	pudp = pud_offset(p4dp, addr);
> >   	if (unlikely(!pudp))
> >   		return -EINVAL;
> > diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> > index 0a1c65a2c565..b2fc3e71165c 100644
> > --- a/arch/powerpc/mm/book3s32/mmu.c
> > +++ b/arch/powerpc/mm/book3s32/mmu.c
> > @@ -312,7 +312,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea)
> >   	if (!Hash)
> >   		return;
> > -	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, ea), ea), ea), ea);
> 
> If we continue like this, in ten years this line is going to be many
> kilometers long.
> 
> I think the above would be worth a generic helper.

Agree. My plan was to first unify all the architectures and then start
introducing the generic helpers, like e.g. pmd_offset_mm().
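Something along those lines, as an illustration only (the helper name and
where it would live are of course not settled yet):

	/* sketch: do the whole pgd -> p4d -> pud -> pmd walk in one place */
	static inline pmd_t *pmd_offset_mm(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		p4d_t *p4d = p4d_offset(pgd, addr);
		pud_t *pud = pud_offset(p4d, addr);

		return pmd_offset(pud, addr);
	}

With that, call sites like hash_preload() and flush_range() go back to a
single short line.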
 
> >   	if (!pmd_none(*pmd))
> >   		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
> >   }
> > diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
> > index 2fcd321040ff..175bc33b41b7 100644
> > --- a/arch/powerpc/mm/book3s32/tlb.c
> > +++ b/arch/powerpc/mm/book3s32/tlb.c
> > @@ -87,7 +87,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
> >   	if (start >= end)
> >   		return;
> >   	end = (end - 1) | ~PAGE_MASK;
> > -	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, start), start), start), start);
> >   	for (;;) {
> >   		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
> >   		if (pmd_end > end)
> > @@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
> >   		return;
> >   	}
> >   	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
> > -	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr), vmaddr);
> >   	if (!pmd_none(*pmd))
> >   		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
> >   }
> > diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> > index 64733b9cb20a..9cd15937e88a 100644
> > --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> > +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> > @@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
> >   int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
> >   {
> >   	pgd_t *pgdp;
> > +	p4d_t *p4dp;
> >   	pud_t *pudp;
> >   	pmd_t *pmdp;
> >   	pte_t *ptep;
> > @@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
> >   	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
> >   	if (slab_is_available()) {
> >   		pgdp = pgd_offset_k(ea);
> > -		pudp = pud_alloc(&init_mm, pgdp, ea);
> > +		p4dp = p4d_offset(pgdp, ea);
> > +		pudp = pud_alloc(&init_mm, p4dp, ea);
> 
> Could be a single line, without a new var.
> 
> -		pudp = pud_alloc(&init_mm, pgdp, ea);
> +		pudp = pud_alloc(&init_mm, p4d_offset(pgdp, ea), ea);
> 
> 
> The same kind of comments as above apply to the rest.
> 
> Christophe
Christophe Leroy Feb. 19, 2020, 12:07 p.m. UTC | #3
On 16/02/2020 at 09:18, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> Implement primitives necessary for the 4th level folding, add walks of p4d
> level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
> ---
>   arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 -
>   arch/powerpc/include/asm/book3s/64/hash.h     |  4 +-
>   arch/powerpc/include/asm/book3s/64/pgalloc.h  |  4 +-
>   arch/powerpc/include/asm/book3s/64/pgtable.h  | 58 ++++++++++--------
>   arch/powerpc/include/asm/book3s/64/radix.h    |  6 +-
>   arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 -
>   arch/powerpc/include/asm/nohash/64/pgalloc.h  |  2 +-
>   .../include/asm/nohash/64/pgtable-4k.h        | 32 +++++-----
>   arch/powerpc/include/asm/nohash/64/pgtable.h  |  6 +-
>   arch/powerpc/include/asm/pgtable.h            |  8 +++
>   arch/powerpc/kvm/book3s_64_mmu_radix.c        | 59 ++++++++++++++++---
>   arch/powerpc/lib/code-patching.c              |  7 ++-
>   arch/powerpc/mm/book3s32/mmu.c                |  2 +-
>   arch/powerpc/mm/book3s32/tlb.c                |  4 +-
>   arch/powerpc/mm/book3s64/hash_pgtable.c       |  4 +-
>   arch/powerpc/mm/book3s64/radix_pgtable.c      | 19 ++++--
>   arch/powerpc/mm/book3s64/subpage_prot.c       |  6 +-
>   arch/powerpc/mm/hugetlbpage.c                 | 28 +++++----
>   arch/powerpc/mm/kasan/kasan_init_32.c         |  8 +--
>   arch/powerpc/mm/mem.c                         |  4 +-
>   arch/powerpc/mm/nohash/40x.c                  |  4 +-
>   arch/powerpc/mm/nohash/book3e_pgtable.c       | 15 +++--
>   arch/powerpc/mm/pgtable.c                     | 25 +++++++-
>   arch/powerpc/mm/pgtable_32.c                  | 28 +++++----
>   arch/powerpc/mm/pgtable_64.c                  | 10 ++--
>   arch/powerpc/mm/ptdump/hashpagetable.c        | 20 ++++++-
>   arch/powerpc/mm/ptdump/ptdump.c               | 22 ++++++-
>   arch/powerpc/xmon/xmon.c                      | 17 +++++-
>   28 files changed, 284 insertions(+), 120 deletions(-)
> 
> diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
> index 206156255247..7bd4b81d5b5d 100644
> --- a/arch/powerpc/mm/ptdump/ptdump.c
> +++ b/arch/powerpc/mm/ptdump/ptdump.c
> @@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
>   	}
>   }
>   
> -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
>   {
> -	pud_t *pud = pud_offset(pgd, 0);
> +	pud_t *pud = pud_offset(p4d, 0);
>   	unsigned long addr;
>   	unsigned int i;
>   
> @@ -293,6 +293,22 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
>   	}
>   }
>   
> +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +	p4d_t *p4d = p4d_offset(pgd, 0);
> +	unsigned long addr;
> +	unsigned int i;
> +
> +	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
> +		addr = start + i * P4D_SIZE;
> +		if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d))
> +			/* p4d exists */
> +			walk_pud(st, p4d, addr);
> +		else
> +			note_page(st, addr, 2, p4d_val(*p4d));

Level 2 is already used by walk_pud().

I think you have to increment the level used in walk_pud(), walk_pmd()
and walk_pte().

> +	}
> +}
> +
>   static void walk_pagetables(struct pg_state *st)
>   {
>   	unsigned int i;
> @@ -306,7 +322,7 @@ static void walk_pagetables(struct pg_state *st)
>   	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
>   		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
>   			/* pgd exists */
> -			walk_pud(st, pgd, addr);
> +			walk_p4d(st, pgd, addr);
>   		else
>   			note_page(st, addr, 1, pgd_val(*pgd));
>   	}

Christophe
Mike Rapoport Feb. 19, 2020, 1:24 p.m. UTC | #4
On Wed, Feb 19, 2020 at 01:07:55PM +0100, Christophe Leroy wrote:
> 
> On 16/02/2020 at 09:18, Mike Rapoport wrote:
> > diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
> > index 206156255247..7bd4b81d5b5d 100644
> > --- a/arch/powerpc/mm/ptdump/ptdump.c
> > +++ b/arch/powerpc/mm/ptdump/ptdump.c
> > @@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
> >   	}
> >   }
> > -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> > +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
> >   {
> > -	pud_t *pud = pud_offset(pgd, 0);
> > +	pud_t *pud = pud_offset(p4d, 0);
> >   	unsigned long addr;
> >   	unsigned int i;
> > @@ -293,6 +293,22 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> >   	}
> >   }
> > +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
> > +{
> > +	p4d_t *p4d = p4d_offset(pgd, 0);
> > +	unsigned long addr;
> > +	unsigned int i;
> > +
> > +	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
> > +		addr = start + i * P4D_SIZE;
> > +		if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d))
> > +			/* p4d exists */
> > +			walk_pud(st, p4d, addr);
> > +		else
> > +			note_page(st, addr, 2, p4d_val(*p4d));
> 
> Level 2 is already used by walk_pud().
> 
> I think you have to increment the level used in walk_pud(), walk_pmd()
> and walk_pte().

Thanks for catching this!
I'll fix the numbers in the next version.
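For the record, the renumbering I have in mind looks roughly like this
(sketch only, one level per page table level counted from the top):

	note_page(st, addr, 1, pgd_val(*pgd));	/* walk_pagetables(), unchanged */
	note_page(st, addr, 2, p4d_val(*p4d));	/* walk_p4d(), new */
	note_page(st, addr, 3, pud_val(*pud));	/* walk_pud(), was 2 */
	note_page(st, addr, 4, pmd_val(*pmd));	/* walk_pmd(), was 3 */
	note_page(st, addr, 5, pte_val(*pte));	/* walk_pte(), was 4 */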
 
> > +	}
> > +}
> > +
> >   static void walk_pagetables(struct pg_state *st)
> >   {
> >   	unsigned int i;
> > @@ -306,7 +322,7 @@ static void walk_pagetables(struct pg_state *st)
> >   	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
> >   		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
> >   			/* pgd exists */
> > -			walk_pud(st, pgd, addr);
> > +			walk_p4d(st, pgd, addr);
> >   		else
> >   			note_page(st, addr, 1, pgd_val(*pgd));
> >   	}
> 
> Christophe
Mike Rapoport Feb. 26, 2020, 9:13 a.m. UTC | #5
On Tue, Feb 18, 2020 at 12:54:40PM +0200, Mike Rapoport wrote:
> On Sun, Feb 16, 2020 at 11:41:07AM +0100, Christophe Leroy wrote:
> > 
> > 
> > On 16/02/2020 at 09:18, Mike Rapoport wrote:
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > 
> > > Implement primitives necessary for the 4th level folding, add walks of p4d
> > > level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
> > 
> > I don't think it is worth adding all these additional walks of p4d; this
> > patch could be limited to changes like:
> > 
> > -		pud = pud_offset(pgd, gpa);
> > +		pud = pud_offset(p4d_offset(pgd, gpa), gpa);
> > 
> > The additional walks should be added in another patch the day powerpc
> > needs them.
> 
> Ok, I'll update the patch to reduce walking the p4d.

Here's what I have with more direct accesses from pgd to pud.

From 6c59a86ce8394fb6100e9b6ced2e346981fb0ce9 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Sun, 24 Nov 2019 15:38:00 +0200
Subject: [PATCH v3] powerpc: add support for folded p4d page tables

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
---
v3:
* reduce amount of added p4d walks
* kill pgtable_32::get_pteptr and traverse page table in
  pgtable_32::__change_page_attr_noflush


 arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 -
 arch/powerpc/include/asm/book3s/64/hash.h     |  4 +-
 arch/powerpc/include/asm/book3s/64/pgalloc.h  |  4 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 60 ++++++++++---------
 arch/powerpc/include/asm/book3s/64/radix.h    |  6 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 -
 arch/powerpc/include/asm/nohash/64/pgalloc.h  |  2 +-
 .../include/asm/nohash/64/pgtable-4k.h        | 32 +++++-----
 arch/powerpc/include/asm/nohash/64/pgtable.h  |  6 +-
 arch/powerpc/include/asm/pgtable.h            |  6 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c        | 30 ++++++----
 arch/powerpc/lib/code-patching.c              |  7 ++-
 arch/powerpc/mm/book3s32/mmu.c                |  2 +-
 arch/powerpc/mm/book3s32/tlb.c                |  4 +-
 arch/powerpc/mm/book3s64/hash_pgtable.c       |  4 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c      | 26 +++++---
 arch/powerpc/mm/book3s64/subpage_prot.c       |  6 +-
 arch/powerpc/mm/hugetlbpage.c                 | 28 +++++----
 arch/powerpc/mm/kasan/kasan_init_32.c         |  8 +--
 arch/powerpc/mm/mem.c                         |  4 +-
 arch/powerpc/mm/nohash/40x.c                  |  4 +-
 arch/powerpc/mm/nohash/book3e_pgtable.c       | 15 ++---
 arch/powerpc/mm/pgtable.c                     | 30 ++++++----
 arch/powerpc/mm/pgtable_32.c                  | 45 +++-----------
 arch/powerpc/mm/pgtable_64.c                  | 10 ++--
 arch/powerpc/mm/ptdump/hashpagetable.c        | 20 ++++++-
 arch/powerpc/mm/ptdump/ptdump.c               | 14 +++--
 arch/powerpc/xmon/xmon.c                      | 18 +++---
 28 files changed, 213 insertions(+), 184 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 5b39c11e884a..39ec11371be0 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -2,7 +2,6 @@
 #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
 #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopmd.h>
 
 #include <asm/book3s/32/hash.h>
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 2781ebf6add4..876d1528c2cf 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea)
 
 #define	hash__pmd_bad(pmd)		(pmd_val(pmd) & H_PMD_BAD_BITS)
 #define	hash__pud_bad(pud)		(pud_val(pud) & H_PUD_BAD_BITS)
-static inline int hash__pgd_bad(pgd_t pgd)
+static inline int hash__p4d_bad(p4d_t p4d)
 {
-	return (pgd_val(pgd) == 0);
+	return (p4d_val(p4d) == 0);
 }
 #ifdef CONFIG_STRICT_KERNEL_RWX
 extern void hash__mark_rodata_ro(void);
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index a41e91bd0580..69c5b051734f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
 {
-	*pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
+	*pgd =  __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 201a69e6a355..fa60e8594b9f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
 #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
 
-#include <asm-generic/5level-fixup.h>
+#include <asm-generic/pgtable-nop4d.h>
 
 #ifndef __ASSEMBLY__
 #include <linux/mmdebug.h>
@@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift;
 /* Bits to mask out from a PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0xc0000000000000ffUL
 /* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS		0xc0000000000000ffUL
+#define P4D_MASKED_BITS		0xc0000000000000ffUL
 
 /*
  * Used as an indicator for rcu callback functions
@@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
 	return pte_access_permitted(pud_pte(pud), write);
 }
 
-#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
+#define __p4d_raw(x)	((p4d_t) { __pgd_raw(x) })
+static inline __be64 p4d_raw(p4d_t x)
+{
+	return pgd_raw(x.pgd);
+}
+
+#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
 
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void p4d_clear(p4d_t *p4dp)
 {
-	*pgdp = __pgd(0);
+	*p4dp = __p4d(0);
 }
 
-static inline int pgd_none(pgd_t pgd)
+static inline int p4d_none(p4d_t p4d)
 {
-	return !pgd_raw(pgd);
+	return !p4d_raw(p4d);
 }
 
-static inline int pgd_present(pgd_t pgd)
+static inline int p4d_present(p4d_t p4d)
 {
-	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
+	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT));
 }
 
-static inline pte_t pgd_pte(pgd_t pgd)
+static inline pte_t p4d_pte(p4d_t p4d)
 {
-	return __pte_raw(pgd_raw(pgd));
+	return __pte_raw(p4d_raw(p4d));
 }
 
-static inline pgd_t pte_pgd(pte_t pte)
+static inline p4d_t pte_p4d(pte_t pte)
 {
-	return __pgd_raw(pte_raw(pte));
+	return __p4d_raw(pte_raw(pte));
 }
 
-static inline int pgd_bad(pgd_t pgd)
+static inline int p4d_bad(p4d_t p4d)
 {
 	if (radix_enabled())
-		return radix__pgd_bad(pgd);
-	return hash__pgd_bad(pgd);
+		return radix__p4d_bad(p4d);
+	return hash__p4d_bad(p4d);
 }
 
-#define pgd_access_permitted pgd_access_permitted
-static inline bool pgd_access_permitted(pgd_t pgd, bool write)
+#define p4d_access_permitted p4d_access_permitted
+static inline bool p4d_access_permitted(p4d_t p4d, bool write)
 {
-	return pte_access_permitted(pgd_pte(pgd), write);
+	return pte_access_permitted(p4d_pte(p4d), write);
 }
 
-extern struct page *pgd_page(pgd_t pgd);
+extern struct page *p4d_page(p4d_t p4d);
 
 /* Pointers in the page table tree are physical addresses */
 #define __pgtable_ptr_val(ptr)	__pa(ptr)
 
 #define pmd_page_vaddr(pmd)	__va(pmd_val(pmd) & ~PMD_MASKED_BITS)
 #define pud_page_vaddr(pud)	__va(pud_val(pud) & ~PUD_MASKED_BITS)
-#define pgd_page_vaddr(pgd)	__va(pgd_val(pgd) & ~PGD_MASKED_BITS)
+#define p4d_page_vaddr(p4d)	__va(p4d_val(p4d) & ~P4D_MASKED_BITS)
 
 #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
 #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
@@ -1010,8 +1016,8 @@ extern struct page *pgd_page(pgd_t pgd);
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
-#define pud_offset(pgdp, addr)	\
-	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
+#define pud_offset(p4dp, addr)	\
+	(((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr))
 #define pmd_offset(pudp,addr) \
 	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
 #define pte_offset_kernel(dir,addr) \
@@ -1368,11 +1374,11 @@ static inline bool pud_is_leaf(pud_t pud)
 	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
 }
 
-#define pgd_is_leaf pgd_is_leaf
-#define pgd_leaf pgd_is_leaf
-static inline bool pgd_is_leaf(pgd_t pgd)
+#define p4d_is_leaf p4d_is_leaf
+#define p4d_leaf p4d_is_leaf
+static inline bool p4d_is_leaf(p4d_t p4d)
 {
-	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
+	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE));
 }
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..9bca2ac64220 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -30,7 +30,7 @@
 /* Don't have anything in the reserved bits and leaf bits */
 #define RADIX_PMD_BAD_BITS		0x60000000000000e0UL
 #define RADIX_PUD_BAD_BITS		0x60000000000000e0UL
-#define RADIX_PGD_BAD_BITS		0x60000000000000e0UL
+#define RADIX_P4D_BAD_BITS		0x60000000000000e0UL
 
 #define RADIX_PMD_SHIFT		(PAGE_SHIFT + RADIX_PTE_INDEX_SIZE)
 #define RADIX_PUD_SHIFT		(RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE)
@@ -227,9 +227,9 @@ static inline int radix__pud_bad(pud_t pud)
 }
 
 
-static inline int radix__pgd_bad(pgd_t pgd)
+static inline int radix__p4d_bad(p4d_t p4d)
 {
-	return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS);
+	return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 60c4d829152e..d4c2c4259fa3 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -2,7 +2,6 @@
 #ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H
 #define _ASM_POWERPC_NOHASH_32_PGTABLE_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopmd.h>
 
 #ifndef __ASSEMBLY__
diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
index b9534a793293..668aee6017e7 100644
--- a/arch/powerpc/include/asm/nohash/64/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
@@ -15,7 +15,7 @@ struct vmemmap_backing {
 };
 extern struct vmemmap_backing *vmemmap_list;
 
-#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)PUD)
+#define p4d_populate(MM, P4D, PUD)	p4d_set(P4D, (unsigned long)PUD)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
index c40ec32b8194..81b1c54e3cf1 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
 #define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
 
-#include <asm-generic/5level-fixup.h>
+#include <asm-generic/pgtable-nop4d.h>
 
 /*
  * Entries per page directory level.  The PTE level must use a 64b record
@@ -45,41 +45,41 @@
 #define PMD_MASKED_BITS		0
 /* Bits to mask out from a PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0
-/* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS		0
+/* Bits to mask out from a P4D to get to the PUD page */
+#define P4D_MASKED_BITS		0
 
 
 /*
  * 4-level page tables related bits
  */
 
-#define pgd_none(pgd)		(!pgd_val(pgd))
-#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
-#define pgd_present(pgd)	(pgd_val(pgd) != 0)
-#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
+#define p4d_none(p4d)		(!p4d_val(p4d))
+#define p4d_bad(p4d)		(p4d_val(p4d) == 0)
+#define p4d_present(p4d)	(p4d_val(p4d) != 0)
+#define p4d_page_vaddr(p4d)	(p4d_val(p4d) & ~P4D_MASKED_BITS)
 
 #ifndef __ASSEMBLY__
 
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void p4d_clear(p4d_t *p4dp)
 {
-	*pgdp = __pgd(0);
+	*p4dp = __p4d(0);
 }
 
-static inline pte_t pgd_pte(pgd_t pgd)
+static inline pte_t p4d_pte(p4d_t p4d)
 {
-	return __pte(pgd_val(pgd));
+	return __pte(p4d_val(p4d));
 }
 
-static inline pgd_t pte_pgd(pte_t pte)
+static inline p4d_t pte_p4d(pte_t pte)
 {
-	return __pgd(pte_val(pte));
+	return __p4d(pte_val(pte));
 }
-extern struct page *pgd_page(pgd_t pgd);
+extern struct page *p4d_page(p4d_t p4d);
 
 #endif /* !__ASSEMBLY__ */
 
-#define pud_offset(pgdp, addr)	\
-  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
+#define pud_offset(p4dp, addr)	\
+  (((pud_t *) p4d_page_vaddr(*(p4dp))) + \
     (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
 
 #define pud_ERROR(e) \
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 9a33b8bd842d..b360f262b9c6 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -175,11 +175,11 @@ static inline pud_t pte_pud(pte_t pte)
 	return __pud(pte_val(pte));
 }
 #define pud_write(pud)		pte_write(pud_pte(pud))
-#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
+#define p4d_write(pgd)		pte_write(p4d_pte(p4d))
 
-static inline void pgd_set(pgd_t *pgdp, unsigned long val)
+static inline void p4d_set(p4d_t *p4dp, unsigned long val)
 {
-	*pgdp = __pgd(val);
+	*p4dp = __p4d(val);
 }
 
 /*
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 8cc543ed114c..05205d7a7b4a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -139,9 +139,9 @@ static inline bool pud_is_leaf(pud_t pud)
 }
 #endif
 
-#ifndef pgd_is_leaf
-#define pgd_is_leaf pgd_is_leaf
-static inline bool pgd_is_leaf(pgd_t pgd)
+#ifndef p4d_is_leaf
+#define p4d_is_leaf p4d_is_leaf
+static inline bool p4d_is_leaf(p4d_t p4d)
 {
 	return false;
 }
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 803940d79b73..beb694285100 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -499,13 +499,14 @@ void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 	unsigned long ig;
 
 	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
+		p4d_t *p4d = p4d_offset(pgd, 0);
 		pud_t *pud;
 
-		if (!pgd_present(*pgd))
+		if (!p4d_present(*p4d))
 			continue;
-		pud = pud_offset(pgd, 0);
+		pud = pud_offset(p4d, 0);
 		kvmppc_unmap_free_pud(kvm, pud, lpid);
-		pgd_clear(pgd);
+		p4d_clear(p4d);
 	}
 }
 
@@ -566,6 +567,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 		      unsigned long *rmapp, struct rmap_nested **n_rmap)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud, *new_pud = NULL;
 	pmd_t *pmd, *new_pmd = NULL;
 	pte_t *ptep, *new_ptep = NULL;
@@ -573,9 +575,11 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 
 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
 	pgd = pgtable + pgd_index(gpa);
+	p4d = p4d_offset(pgd, gpa);
+
 	pud = NULL;
-	if (pgd_present(*pgd))
-		pud = pud_offset(pgd, gpa);
+	if (p4d_present(*p4d))
+		pud = pud_offset(p4d, gpa);
 	else
 		new_pud = pud_alloc_one(kvm->mm, gpa);
 
@@ -596,13 +600,13 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 
 	/* Now traverse again under the lock and change the tree */
 	ret = -ENOMEM;
-	if (pgd_none(*pgd)) {
+	if (p4d_none(*p4d)) {
 		if (!new_pud)
 			goto out_unlock;
-		pgd_populate(kvm->mm, pgd, new_pud);
+		p4d_populate(kvm->mm, p4d, new_pud);
 		new_pud = NULL;
 	}
-	pud = pud_offset(pgd, gpa);
+	pud = pud_offset(p4d, gpa);
 	if (pud_is_leaf(*pud)) {
 		unsigned long hgpa = gpa & PUD_MASK;
 
@@ -1220,6 +1224,7 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
 	pgd_t *pgt;
 	struct kvm_nested_guest *nested;
 	pgd_t pgd, *pgdp;
+	p4d_t p4d, *p4dp;
 	pud_t pud, *pudp;
 	pmd_t pmd, *pmdp;
 	pte_t *ptep;
@@ -1292,13 +1297,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
 		}
 
 		pgdp = pgt + pgd_index(gpa);
-		pgd = READ_ONCE(*pgdp);
-		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
-			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
+		p4dp = p4d_offset(pgdp, gpa);
+		p4d = READ_ONCE(*p4dp);
+		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
+			gpa = (gpa & P4D_MASK) + P4D_SIZE;
 			continue;
 		}
 
-		pudp = pud_offset(&pgd, gpa);
+		pudp = pud_offset(&p4d, gpa);
 		pud = READ_ONCE(*pudp);
 		if (!(pud_val(pud) & _PAGE_PRESENT)) {
 			gpa = (gpa & PUD_MASK) + PUD_SIZE;
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 3345f039a876..7a59f6863cec 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr)
 	pte_t *ptep;
 	pmd_t *pmdp;
 	pud_t *pudp;
+	p4d_t *p4dp;
 	pgd_t *pgdp;
 
 	pgdp = pgd_offset_k(addr);
 	if (unlikely(!pgdp))
 		return -EINVAL;
 
-	pudp = pud_offset(pgdp, addr);
+	p4dp = p4d_offset(pgdp, addr);
+	if (unlikely(!p4dp))
+		return -EINVAL;
+
+	pudp = pud_offset(p4dp, addr);
 	if (unlikely(!pudp))
 		return -EINVAL;
 
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index f888cbb109b9..edef17c97206 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -312,7 +312,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea)
 
 	if (!Hash)
 		return;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, ea), ea), ea), ea);
 	if (!pmd_none(*pmd))
 		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
 }
diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
index 2fcd321040ff..175bc33b41b7 100644
--- a/arch/powerpc/mm/book3s32/tlb.c
+++ b/arch/powerpc/mm/book3s32/tlb.c
@@ -87,7 +87,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
 	if (start >= end)
 		return;
 	end = (end - 1) | ~PAGE_MASK;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, start), start), start), start);
 	for (;;) {
 		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
 		if (pmd_end > end)
@@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 		return;
 	}
 	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr), vmaddr);
 	if (!pmd_none(*pmd))
 		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
 }
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 64733b9cb20a..9cd15937e88a 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
 int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 {
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
 	if (slab_is_available()) {
 		pgdp = pgd_offset_k(ea);
-		pudp = pud_alloc(&init_mm, pgdp, ea);
+		p4dp = p4d_offset(pgdp, ea);
+		pudp = pud_alloc(&init_mm, p4dp, ea);
 		if (!pudp)
 			return -ENOMEM;
 		pmdp = pmd_alloc(&init_mm, pudp, ea);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index dd1bea45325c..fc3d0b0460b0 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -64,17 +64,19 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
 {
 	unsigned long pfn = pa >> PAGE_SHIFT;
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
 
 	pgdp = pgd_offset_k(ea);
-	if (pgd_none(*pgdp)) {
+	p4dp = p4d_offset(pgdp, ea);
+	if (p4d_none(*p4dp)) {
 		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
 						region_start, region_end);
-		pgd_populate(&init_mm, pgdp, pudp);
+		p4d_populate(&init_mm, p4dp, pudp);
 	}
-	pudp = pud_offset(pgdp, ea);
+	pudp = pud_offset(p4dp, ea);
 	if (map_page_size == PUD_SIZE) {
 		ptep = (pte_t *)pudp;
 		goto set_the_pte;
@@ -114,6 +116,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
 {
 	unsigned long pfn = pa >> PAGE_SHIFT;
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -136,7 +139,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
 	 * boot.
 	 */
 	pgdp = pgd_offset_k(ea);
-	pudp = pud_alloc(&init_mm, pgdp, ea);
+	p4dp = p4d_offset(pgdp, ea);
+	pudp = pud_alloc(&init_mm, p4dp, ea);
 	if (!pudp)
 		return -ENOMEM;
 	if (map_page_size == PUD_SIZE) {
@@ -173,6 +177,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
 {
 	unsigned long idx;
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -185,7 +190,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
 
 	for (idx = start; idx < end; idx += PAGE_SIZE) {
 		pgdp = pgd_offset_k(idx);
-		pudp = pud_alloc(&init_mm, pgdp, idx);
+		p4dp = p4d_offset(pgdp, idx);
+		pudp = pud_alloc(&init_mm, p4dp, idx);
 		if (!pudp)
 			continue;
 		if (pud_is_leaf(*pudp)) {
@@ -847,6 +853,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
 	unsigned long addr, next;
 	pud_t *pud_base;
 	pgd_t *pgd;
+	p4d_t *p4d;
 
 	spin_lock(&init_mm.page_table_lock);
 
@@ -854,15 +861,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
 		next = pgd_addr_end(addr, end);
 
 		pgd = pgd_offset_k(addr);
-		if (!pgd_present(*pgd))
+		p4d = p4d_offset(pgd, addr);
+		if (!p4d_present(*p4d))
 			continue;
 
-		if (pgd_is_leaf(*pgd)) {
-			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
+		if (p4d_is_leaf(*p4d)) {
+			split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
 			continue;
 		}
 
-		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+		pud_base = (pud_t *)p4d_page_vaddr(*p4d);
 		remove_pud_table(pud_base, addr, next);
 	}
 
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 2ef24a53f4c9..25a0c044bd93 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 			     int npages)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 	spinlock_t *ptl;
 
 	pgd = pgd_offset(mm, addr);
-	if (pgd_none(*pgd))
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
 		return;
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud))
 		return;
 	pmd = pmd_offset(pud, addr);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 33b3461d91e8..54f5994d4cbb 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -119,6 +119,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
 	pgd_t *pg;
+	p4d_t *p4;
 	pud_t *pu;
 	pmd_t *pm;
 	hugepd_t *hpdp = NULL;
@@ -128,20 +129,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 
 	addr &= ~(sz-1);
 	pg = pgd_offset(mm, addr);
+	p4 = p4d_offset(pg, addr);
 
 #ifdef CONFIG_PPC_BOOK3S_64
 	if (pshift == PGDIR_SHIFT)
 		/* 16GB huge page */
-		return (pte_t *) pg;
+		return (pte_t *) p4;
 	else if (pshift > PUD_SHIFT) {
 		/*
 		 * We need to use hugepd table
 		 */
 		ptl = &mm->page_table_lock;
-		hpdp = (hugepd_t *)pg;
+		hpdp = (hugepd_t *)p4;
 	} else {
 		pdshift = PUD_SHIFT;
-		pu = pud_alloc(mm, pg, addr);
+		pu = pud_alloc(mm, p4, addr);
 		if (!pu)
 			return NULL;
 		if (pshift == PUD_SHIFT)
@@ -166,10 +168,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 #else
 	if (pshift >= PGDIR_SHIFT) {
 		ptl = &mm->page_table_lock;
-		hpdp = (hugepd_t *)pg;
+		hpdp = (hugepd_t *)p4;
 	} else {
 		pdshift = PUD_SHIFT;
-		pu = pud_alloc(mm, pg, addr);
+		pu = pud_alloc(mm, p4, addr);
 		if (!pu)
 			return NULL;
 		if (pshift >= PUD_SHIFT) {
@@ -390,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	mm_dec_nr_pmds(tlb->mm);
 }
 
-static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 				   unsigned long addr, unsigned long end,
 				   unsigned long floor, unsigned long ceiling)
 {
@@ -400,7 +402,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 	start = addr;
 	do {
-		pud = pud_offset(pgd, addr);
+		pud = pud_offset(p4d, addr);
 		next = pud_addr_end(addr, end);
 		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
 			if (pud_none_or_clear_bad(pud))
@@ -435,8 +437,8 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	if (end - 1 > ceiling - 1)
 		return;
 
-	pud = pud_offset(pgd, start);
-	pgd_clear(pgd);
+	pud = pud_offset(p4d, start);
+	p4d_clear(p4d);
 	pud_free_tlb(tlb, pud, start);
 	mm_dec_nr_puds(tlb->mm);
 }
@@ -449,6 +451,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			    unsigned long floor, unsigned long ceiling)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	unsigned long next;
 
 	/*
@@ -471,10 +474,11 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	do {
 		next = pgd_addr_end(addr, end);
 		pgd = pgd_offset(tlb->mm, addr);
+		p4d = p4d_offset(pgd, addr);
 		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
-			if (pgd_none_or_clear_bad(pgd))
+			if (p4d_none_or_clear_bad(p4d))
 				continue;
-			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+			hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
 		} else {
 			unsigned long more;
 			/*
@@ -487,7 +491,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			if (more > next)
 				next = more;
 
-			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+			free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
 					  addr, next, floor, ceiling);
 		}
 	} while (addr = next, addr != end);
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index db5664dde5ff..88e2e16380b5 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -36,7 +36,7 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned
 	unsigned long k_cur, k_next;
 	pte_t *new = NULL;
 
-	pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_start), k_start), k_start), k_start);
 
 	for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
 		k_next = pgd_addr_end(k_cur, k_end);
@@ -78,7 +78,7 @@ static int __init kasan_init_region(void *start, size_t size)
 	block = memblock_alloc(k_end - k_start, PAGE_SIZE);
 
 	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
-		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
 		void *va = block + k_cur - k_start;
 		pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
 
@@ -102,7 +102,7 @@ static void __init kasan_remap_early_shadow_ro(void)
 	kasan_populate_pte(kasan_early_shadow_pte, prot);
 
 	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
-		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
 		pte_t *ptep = pte_offset_kernel(pmd, k_cur);
 
 		if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
@@ -201,7 +201,7 @@ void __init kasan_early_init(void)
 	unsigned long addr = KASAN_SHADOW_START;
 	unsigned long end = KASAN_SHADOW_END;
 	unsigned long next;
-	pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
+	pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(addr), addr), addr), addr);
 
 	BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ef7b1119b2e2..8262b384dcf3 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -69,8 +69,8 @@ EXPORT_SYMBOL(kmap_prot);
 
 static inline pte_t *virt_to_kpte(unsigned long vaddr)
 {
-	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
-			vaddr), vaddr), vaddr);
+	return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr),
+			vaddr), vaddr), vaddr), vaddr);
 }
 #endif
 
diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c
index f348104eb461..7aaf7155e350 100644
--- a/arch/powerpc/mm/nohash/40x.c
+++ b/arch/powerpc/mm/nohash/40x.c
@@ -104,7 +104,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
 
-		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
 		*pmdp++ = __pmd(val);
 		*pmdp++ = __pmd(val);
 		*pmdp++ = __pmd(val);
@@ -119,7 +119,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
 
-		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
 		*pmdp = __pmd(val);
 
 		v += LARGE_PAGE_SIZE_4M;
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 4637fdd469cf..77884e24281d 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -73,6 +73,7 @@ static void __init *early_alloc_pgtable(unsigned long size)
 int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 {
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -80,7 +81,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 	BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
 	if (slab_is_available()) {
 		pgdp = pgd_offset_k(ea);
-		pudp = pud_alloc(&init_mm, pgdp, ea);
+		p4dp = p4d_offset(pgdp, ea);
+		pudp = pud_alloc(&init_mm, p4dp, ea);
 		if (!pudp)
 			return -ENOMEM;
 		pmdp = pmd_alloc(&init_mm, pudp, ea);
@@ -91,13 +93,12 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 			return -ENOMEM;
 	} else {
 		pgdp = pgd_offset_k(ea);
-#ifndef __PAGETABLE_PUD_FOLDED
-		if (pgd_none(*pgdp)) {
-			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
-			pgd_populate(&init_mm, pgdp, pudp);
+		p4dp = p4d_offset(pgdp, ea);
+		if (p4d_none(*p4dp)) {
+			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+			p4d_populate(&init_mm, p4dp, pmdp);
 		}
-#endif /* !__PAGETABLE_PUD_FOLDED */
-		pudp = pud_offset(pgdp, ea);
+		pudp = pud_offset(p4dp, ea);
 		if (pud_none(*pudp)) {
 			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
 			pud_populate(&init_mm, pudp, pmdp);
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index e3759b69f81b..c2499271f6c1 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -265,6 +265,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -272,7 +273,9 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 		return;
 	pgd = mm->pgd + pgd_index(addr);
 	BUG_ON(pgd_none(*pgd));
-	pud = pud_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	BUG_ON(p4d_none(*p4d));
+	pud = pud_offset(p4d, addr);
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, addr);
 	/*
@@ -312,12 +315,13 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys);
 pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 			bool *is_thp, unsigned *hpage_shift)
 {
-	pgd_t pgd, *pgdp;
+	pgd_t *pgdp;
+	p4d_t p4d, *p4dp;
 	pud_t pud, *pudp;
 	pmd_t pmd, *pmdp;
 	pte_t *ret_pte;
 	hugepd_t *hpdp = NULL;
-	unsigned pdshift = PGDIR_SHIFT;
+	unsigned pdshift;
 
 	if (hpage_shift)
 		*hpage_shift = 0;
@@ -325,24 +329,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 	if (is_thp)
 		*is_thp = false;
 
-	pgdp = pgdir + pgd_index(ea);
-	pgd  = READ_ONCE(*pgdp);
 	/*
 	 * Always operate on the local stack value. This make sure the
 	 * value don't get updated by a parallel THP split/collapse,
 	 * page fault or a page unmap. The return pte_t * is still not
 	 * stable. So should be checked there for above conditions.
+	 * Top level is an exception because it is folded into p4d.
 	 */
-	if (pgd_none(pgd))
+	pgdp = pgdir + pgd_index(ea);
+	p4dp = p4d_offset(pgdp, ea);
+	p4d  = READ_ONCE(*p4dp);
+	pdshift = P4D_SHIFT;
+
+	if (p4d_none(p4d))
 		return NULL;
 
-	if (pgd_is_leaf(pgd)) {
-		ret_pte = (pte_t *)pgdp;
+	if (p4d_is_leaf(p4d)) {
+		ret_pte = (pte_t *)p4dp;
 		goto out;
 	}
 
-	if (is_hugepd(__hugepd(pgd_val(pgd)))) {
-		hpdp = (hugepd_t *)&pgd;
+	if (is_hugepd(__hugepd(p4d_val(p4d)))) {
+		hpdp = (hugepd_t *)&p4d;
 		goto out_huge;
 	}
 
@@ -352,7 +360,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 	 * irq disabled
 	 */
 	pdshift = PUD_SHIFT;
-	pudp = pud_offset(&pgd, ea);
+	pudp = pud_offset(&p4d, ea);
 	pud  = READ_ONCE(*pudp);
 
 	if (pud_none(pud))
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 5fb90edd865e..5774d4bc94d0 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -63,7 +63,7 @@ int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
 	int err = -ENOMEM;
 
 	/* Use upper 10 bits of VA to index the first level map */
-	pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
+	pd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
 	/* Use middle 10 bits of VA to index the second-level map */
 	if (likely(slab_is_available()))
 		pg = pte_alloc_kernel(pd, va);
@@ -121,53 +121,24 @@ void __init mapin_ram(void)
 	}
 }
 
-/* Scan the real Linux page tables and return a PTE pointer for
- * a virtual address in a context.
- * Returns true (1) if PTE was found, zero otherwise.  The pointer to
- * the PTE pointer is unmodified if PTE is not found.
- */
-static int
-get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
-{
-        pgd_t	*pgd;
-	pud_t	*pud;
-        pmd_t	*pmd;
-        pte_t	*pte;
-        int     retval = 0;
-
-        pgd = pgd_offset(mm, addr & PAGE_MASK);
-        if (pgd) {
-		pud = pud_offset(pgd, addr & PAGE_MASK);
-		if (pud && pud_present(*pud)) {
-			pmd = pmd_offset(pud, addr & PAGE_MASK);
-			if (pmd_present(*pmd)) {
-				pte = pte_offset_map(pmd, addr & PAGE_MASK);
-				if (pte) {
-					retval = 1;
-					*ptep = pte;
-					if (pmdp)
-						*pmdp = pmd;
-					/* XXX caller needs to do pte_unmap, yuck */
-				}
-			}
-		}
-        }
-        return(retval);
-}
-
 static int __change_page_attr_noflush(struct page *page, pgprot_t prot)
 {
 	pte_t *kpte;
 	pmd_t *kpmd;
-	unsigned long address;
+	unsigned long address, va;
 
 	BUG_ON(PageHighMem(page));
 	address = (unsigned long)page_address(page);
+	va = address & PAGE_MASK;
 
 	if (v_block_mapped(address))
 		return 0;
-	if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
+
+	kpmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
+	if (!pmd_present(*kpmd))
 		return -EINVAL;
+
+	kpte = pte_offset_map(kpmd, va);
 	__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
 	pte_unmap(kpte);
 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e78832dce7bb..1f86a88fd4bb 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -101,13 +101,13 @@ EXPORT_SYMBOL(__pte_frag_size_shift);
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /* 4 level page table */
-struct page *pgd_page(pgd_t pgd)
+struct page *p4d_page(p4d_t p4d)
 {
-	if (pgd_is_leaf(pgd)) {
-		VM_WARN_ON(!pgd_huge(pgd));
-		return pte_page(pgd_pte(pgd));
+	if (p4d_is_leaf(p4d)) {
+		VM_WARN_ON(!p4d_huge(p4d));
+		return pte_page(p4d_pte(p4d));
 	}
-	return virt_to_page(pgd_page_vaddr(pgd));
+	return virt_to_page(p4d_page_vaddr(p4d));
 }
 #endif
 
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index a07278027c6f..ac360ad865a8 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -417,9 +417,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
 	}
 }
 
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
 {
-	pud_t *pud = pud_offset(pgd, 0);
+	pud_t *pud = pud_offset(p4d, 0);
 	unsigned long addr;
 	unsigned int i;
 
@@ -431,6 +431,20 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
 	}
 }
 
+static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+	p4d_t *p4d = p4d_offset(pgd, 0);
+	unsigned long addr;
+	unsigned int i;
+
+	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+		addr = start + i * P4D_SIZE;
+		if (!p4d_none(*p4d))
+			/* p4d exists */
+			walk_pud(st, p4d, addr);
+	}
+}
+
 static void walk_pagetables(struct pg_state *st)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st)
 		addr = KERN_VIRT_START + i * PGDIR_SIZE;
 		if (!pgd_none(*pgd))
 			/* pgd exists */
-			walk_pud(st, pgd, addr);
+			walk_p4d(st, pgd, addr);
 	}
 }
 
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 206156255247..9d6256b61df3 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
 	}
 }
 
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
 {
-	pud_t *pud = pud_offset(pgd, 0);
+	pud_t *pud = pud_offset(p4d, 0);
 	unsigned long addr;
 	unsigned int i;
 
@@ -304,11 +304,13 @@ static void walk_pagetables(struct pg_state *st)
 	 * the hash pagetable.
 	 */
 	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
-		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
-			/* pgd exists */
-			walk_pud(st, pgd, addr);
+		p4d_t *p4d = p4d_offset(pgd, 0);
+
+		if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d))
+			/* p4d exists */
+			walk_pud(st, p4d, addr);
 		else
-			note_page(st, addr, 1, pgd_val(*pgd));
+			note_page(st, addr, 1, p4d_val(*p4d));
 	}
 }
 
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 0ec9640335bb..3e29128c58cc 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3130,6 +3130,7 @@ static void show_pte(unsigned long addr)
 	struct task_struct *tsk = NULL;
 	struct mm_struct *mm;
 	pgd_t *pgdp, *pgdir;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -3161,20 +3162,21 @@ static void show_pte(unsigned long addr)
 		pgdir = pgd_offset(mm, 0);
 	}
 
-	if (pgd_none(*pgdp)) {
-		printf("no linux page table for address\n");
+	p4dp = p4d_offset(pgdp, addr);
+
+	if (p4d_none(*p4dp)) {
+		printf("No valid P4D\n");
 		return;
 	}
 
-	printf("pgd  @ 0x%px\n", pgdir);
-
-	if (pgd_is_leaf(*pgdp)) {
-		format_pte(pgdp, pgd_val(*pgdp));
+	if (p4d_is_leaf(*p4dp)) {
+		format_pte(p4dp, p4d_val(*p4dp));
 		return;
 	}
-	printf("pgdp @ 0x%px = 0x%016lx\n", pgdp, pgd_val(*pgdp));
 
-	pudp = pud_offset(pgdp, addr);
+	printf("p4dp @ 0x%px = 0x%016lx\n", p4dp, p4d_val(*p4dp));
+
+	pudp = pud_offset(p4dp, addr);
 
 	if (pud_none(*pudp)) {
 		printf("No valid PUD\n");

Christophe Leroy Feb. 26, 2020, 9:46 a.m. UTC | #6
On 26/02/2020 at 10:13, Mike Rapoport wrote:
> On Tue, Feb 18, 2020 at 12:54:40PM +0200, Mike Rapoport wrote:
>> On Sun, Feb 16, 2020 at 11:41:07AM +0100, Christophe Leroy wrote:
>>>
>>>
>>> On 16/02/2020 at 09:18, Mike Rapoport wrote:
>>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>>
>>>> Implement primitives necessary for the 4th level folding, add walks of p4d
>>>> level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
>>>
>>> I don't think it is worth adding all this additionnals walks of p4d, this
>>> patch could be limited to changes like:
>>>
>>> -		pud = pud_offset(pgd, gpa);
>>> +		pud = pud_offset(p4d_offset(pgd, gpa), gpa);
>>>
>>> The additionnal walks should be added through another patch the day powerpc
>>> need them.
>>
>> Ok, I'll update the patch to reduce walking the p4d.
> 
> Here's what I have with more direct accesses from pgd to pud.

I went through it quickly. This looks promising.

Do we need the walk_p4d() in arch/powerpc/mm/ptdump/hashpagetable.c?
Can't we just do:

@@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st)
  		addr = KERN_VIRT_START + i * PGDIR_SIZE;
  		if (!pgd_none(*pgd))
  			/* pgd exists */
-			walk_pud(st, pgd, addr);
+			walk_pud(st, p4d_offset(pgd, addr), addr);
  	}
  }
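
Just to spell out what I mean, here is a rough sketch (not a tested
patch, and assuming the rest of walk_pagetables() stays as it is in the
current tree) of the loop without a dedicated walk_p4d(). The only
subtlety is that the none check has to test the p4d entry, because
pgd_none() is always false once the p4d level is folded:

static void walk_pagetables(struct pg_state *st)
{
	pgd_t *pgd = pgd_offset_k(0UL);
	unsigned long addr;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
		p4d_t *p4d = p4d_offset(pgd, 0);

		addr = KERN_VIRT_START + i * PGDIR_SIZE;
		if (!p4d_none(*p4d))
			/* entry exists, descend directly to the puds */
			walk_pud(st, p4d, addr);
	}
}

walk_pud() then iterates the puds exactly as before, so the dump output
stays the same.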



Also, I think the removal of get_pteptr() should be a patch by itself.

You could include my patches in your series. See 
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=152102

Christophe



> 
>  From 6c59a86ce8394fb6100e9b6ced2e346981fb0ce9 Mon Sep 17 00:00:00 2001
> From: Mike Rapoport <rppt@linux.ibm.com>
> Date: Sun, 24 Nov 2019 15:38:00 +0200
> Subject: [PATCH v3] powerpc: add support for folded p4d page tables
> 
> Implement primitives necessary for the 4th level folding, add walks of p4d
> level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
> ---
> v3:
> * reduce amount of added p4d walks
> * kill pgtable_32::get_pteptr and traverse page table in
>    pgtable_32::__change_page_attr_noflush
> 
> 
>   arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 -
>   arch/powerpc/include/asm/book3s/64/hash.h     |  4 +-
>   arch/powerpc/include/asm/book3s/64/pgalloc.h  |  4 +-
>   arch/powerpc/include/asm/book3s/64/pgtable.h  | 60 ++++++++++---------
>   arch/powerpc/include/asm/book3s/64/radix.h    |  6 +-
>   arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 -
>   arch/powerpc/include/asm/nohash/64/pgalloc.h  |  2 +-
>   .../include/asm/nohash/64/pgtable-4k.h        | 32 +++++-----
>   arch/powerpc/include/asm/nohash/64/pgtable.h  |  6 +-
>   arch/powerpc/include/asm/pgtable.h            |  6 +-
>   arch/powerpc/kvm/book3s_64_mmu_radix.c        | 30 ++++++----
>   arch/powerpc/lib/code-patching.c              |  7 ++-
>   arch/powerpc/mm/book3s32/mmu.c                |  2 +-
>   arch/powerpc/mm/book3s32/tlb.c                |  4 +-
>   arch/powerpc/mm/book3s64/hash_pgtable.c       |  4 +-
>   arch/powerpc/mm/book3s64/radix_pgtable.c      | 26 +++++---
>   arch/powerpc/mm/book3s64/subpage_prot.c       |  6 +-
>   arch/powerpc/mm/hugetlbpage.c                 | 28 +++++----
>   arch/powerpc/mm/kasan/kasan_init_32.c         |  8 +--
>   arch/powerpc/mm/mem.c                         |  4 +-
>   arch/powerpc/mm/nohash/40x.c                  |  4 +-
>   arch/powerpc/mm/nohash/book3e_pgtable.c       | 15 ++---
>   arch/powerpc/mm/pgtable.c                     | 30 ++++++----
>   arch/powerpc/mm/pgtable_32.c                  | 45 +++-----------
>   arch/powerpc/mm/pgtable_64.c                  | 10 ++--
>   arch/powerpc/mm/ptdump/hashpagetable.c        | 20 ++++++-
>   arch/powerpc/mm/ptdump/ptdump.c               | 14 +++--
>   arch/powerpc/xmon/xmon.c                      | 18 +++---
>   28 files changed, 213 insertions(+), 184 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 5b39c11e884a..39ec11371be0 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -2,7 +2,6 @@
>   #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
>   #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
>   
> -#define __ARCH_USE_5LEVEL_HACK
>   #include <asm-generic/pgtable-nopmd.h>
>   
>   #include <asm/book3s/32/hash.h>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> index 2781ebf6add4..876d1528c2cf 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea)
>   
>   #define	hash__pmd_bad(pmd)		(pmd_val(pmd) & H_PMD_BAD_BITS)
>   #define	hash__pud_bad(pud)		(pud_val(pud) & H_PUD_BAD_BITS)
> -static inline int hash__pgd_bad(pgd_t pgd)
> +static inline int hash__p4d_bad(p4d_t p4d)
>   {
> -	return (pgd_val(pgd) == 0);
> +	return (p4d_val(p4d) == 0);
>   }
>   #ifdef CONFIG_STRICT_KERNEL_RWX
>   extern void hash__mark_rodata_ro(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index a41e91bd0580..69c5b051734f 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
>   	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
>   }
>   
> -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
>   {
> -	*pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
> +	*pgd =  __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
>   }
>   
>   static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 201a69e6a355..fa60e8594b9f 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -2,7 +2,7 @@
>   #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
>   #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
>   
> -#include <asm-generic/5level-fixup.h>
> +#include <asm-generic/pgtable-nop4d.h>
>   
>   #ifndef __ASSEMBLY__
>   #include <linux/mmdebug.h>
> @@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift;
>   /* Bits to mask out from a PUD to get to the PMD page */
>   #define PUD_MASKED_BITS		0xc0000000000000ffUL
>   /* Bits to mask out from a PGD to get to the PUD page */
> -#define PGD_MASKED_BITS		0xc0000000000000ffUL
> +#define P4D_MASKED_BITS		0xc0000000000000ffUL
>   
>   /*
>    * Used as an indicator for rcu callback functions
> @@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
>   	return pte_access_permitted(pud_pte(pud), write);
>   }
>   
> -#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
> +#define __p4d_raw(x)	((p4d_t) { __pgd_raw(x) })
> +static inline __be64 p4d_raw(p4d_t x)
> +{
> +	return pgd_raw(x.pgd);
> +}
> +
> +#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
>   
> -static inline void pgd_clear(pgd_t *pgdp)
> +static inline void p4d_clear(p4d_t *p4dp)
>   {
> -	*pgdp = __pgd(0);
> +	*p4dp = __p4d(0);
>   }
>   
> -static inline int pgd_none(pgd_t pgd)
> +static inline int p4d_none(p4d_t p4d)
>   {
> -	return !pgd_raw(pgd);
> +	return !p4d_raw(p4d);
>   }
>   
> -static inline int pgd_present(pgd_t pgd)
> +static inline int p4d_present(p4d_t p4d)
>   {
> -	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
> +	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT));
>   }
>   
> -static inline pte_t pgd_pte(pgd_t pgd)
> +static inline pte_t p4d_pte(p4d_t p4d)
>   {
> -	return __pte_raw(pgd_raw(pgd));
> +	return __pte_raw(p4d_raw(p4d));
>   }
>   
> -static inline pgd_t pte_pgd(pte_t pte)
> +static inline p4d_t pte_p4d(pte_t pte)
>   {
> -	return __pgd_raw(pte_raw(pte));
> +	return __p4d_raw(pte_raw(pte));
>   }
>   
> -static inline int pgd_bad(pgd_t pgd)
> +static inline int p4d_bad(p4d_t p4d)
>   {
>   	if (radix_enabled())
> -		return radix__pgd_bad(pgd);
> -	return hash__pgd_bad(pgd);
> +		return radix__p4d_bad(p4d);
> +	return hash__p4d_bad(p4d);
>   }
>   
> -#define pgd_access_permitted pgd_access_permitted
> -static inline bool pgd_access_permitted(pgd_t pgd, bool write)
> +#define p4d_access_permitted p4d_access_permitted
> +static inline bool p4d_access_permitted(p4d_t p4d, bool write)
>   {
> -	return pte_access_permitted(pgd_pte(pgd), write);
> +	return pte_access_permitted(p4d_pte(p4d), write);
>   }
>   
> -extern struct page *pgd_page(pgd_t pgd);
> +extern struct page *p4d_page(p4d_t p4d);
>   
>   /* Pointers in the page table tree are physical addresses */
>   #define __pgtable_ptr_val(ptr)	__pa(ptr)
>   
>   #define pmd_page_vaddr(pmd)	__va(pmd_val(pmd) & ~PMD_MASKED_BITS)
>   #define pud_page_vaddr(pud)	__va(pud_val(pud) & ~PUD_MASKED_BITS)
> -#define pgd_page_vaddr(pgd)	__va(pgd_val(pgd) & ~PGD_MASKED_BITS)
> +#define p4d_page_vaddr(p4d)	__va(p4d_val(p4d) & ~P4D_MASKED_BITS)
>   
>   #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
>   #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
> @@ -1010,8 +1016,8 @@ extern struct page *pgd_page(pgd_t pgd);
>   
>   #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
>   
> -#define pud_offset(pgdp, addr)	\
> -	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
> +#define pud_offset(p4dp, addr)	\
> +	(((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr))
>   #define pmd_offset(pudp,addr) \
>   	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
>   #define pte_offset_kernel(dir,addr) \
> @@ -1368,11 +1374,11 @@ static inline bool pud_is_leaf(pud_t pud)
>   	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
>   }
>   
> -#define pgd_is_leaf pgd_is_leaf
> -#define pgd_leaf pgd_is_leaf
> -static inline bool pgd_is_leaf(pgd_t pgd)
> +#define p4d_is_leaf p4d_is_leaf
> +#define p4d_leaf p4d_is_leaf
> +static inline bool p4d_is_leaf(p4d_t p4d)
>   {
> -	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
> +	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE));
>   }
>   
>   #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index d97db3ad9aae..9bca2ac64220 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -30,7 +30,7 @@
>   /* Don't have anything in the reserved bits and leaf bits */
>   #define RADIX_PMD_BAD_BITS		0x60000000000000e0UL
>   #define RADIX_PUD_BAD_BITS		0x60000000000000e0UL
> -#define RADIX_PGD_BAD_BITS		0x60000000000000e0UL
> +#define RADIX_P4D_BAD_BITS		0x60000000000000e0UL
>   
>   #define RADIX_PMD_SHIFT		(PAGE_SHIFT + RADIX_PTE_INDEX_SIZE)
>   #define RADIX_PUD_SHIFT		(RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE)
> @@ -227,9 +227,9 @@ static inline int radix__pud_bad(pud_t pud)
>   }
>   
>   
> -static inline int radix__pgd_bad(pgd_t pgd)
> +static inline int radix__p4d_bad(p4d_t p4d)
>   {
> -	return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS);
> +	return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS);
>   }
>   
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
> index 60c4d829152e..d4c2c4259fa3 100644
> --- a/arch/powerpc/include/asm/nohash/32/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
> @@ -2,7 +2,6 @@
>   #ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H
>   #define _ASM_POWERPC_NOHASH_32_PGTABLE_H
>   
> -#define __ARCH_USE_5LEVEL_HACK
>   #include <asm-generic/pgtable-nopmd.h>
>   
>   #ifndef __ASSEMBLY__
> diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
> index b9534a793293..668aee6017e7 100644
> --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
> @@ -15,7 +15,7 @@ struct vmemmap_backing {
>   };
>   extern struct vmemmap_backing *vmemmap_list;
>   
> -#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)PUD)
> +#define p4d_populate(MM, P4D, PUD)	p4d_set(P4D, (unsigned long)PUD)
>   
>   static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>   {
> diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
> index c40ec32b8194..81b1c54e3cf1 100644
> --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
> +++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
> @@ -2,7 +2,7 @@
>   #ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
>   #define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
>   
> -#include <asm-generic/5level-fixup.h>
> +#include <asm-generic/pgtable-nop4d.h>
>   
>   /*
>    * Entries per page directory level.  The PTE level must use a 64b record
> @@ -45,41 +45,41 @@
>   #define PMD_MASKED_BITS		0
>   /* Bits to mask out from a PUD to get to the PMD page */
>   #define PUD_MASKED_BITS		0
> -/* Bits to mask out from a PGD to get to the PUD page */
> -#define PGD_MASKED_BITS		0
> +/* Bits to mask out from a P4D to get to the PUD page */
> +#define P4D_MASKED_BITS		0
>   
>   
>   /*
>    * 4-level page tables related bits
>    */
>   
> -#define pgd_none(pgd)		(!pgd_val(pgd))
> -#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
> -#define pgd_present(pgd)	(pgd_val(pgd) != 0)
> -#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
> +#define p4d_none(p4d)		(!p4d_val(p4d))
> +#define p4d_bad(p4d)		(p4d_val(p4d) == 0)
> +#define p4d_present(p4d)	(p4d_val(p4d) != 0)
> +#define p4d_page_vaddr(p4d)	(p4d_val(p4d) & ~P4D_MASKED_BITS)
>   
>   #ifndef __ASSEMBLY__
>   
> -static inline void pgd_clear(pgd_t *pgdp)
> +static inline void p4d_clear(p4d_t *p4dp)
>   {
> -	*pgdp = __pgd(0);
> +	*p4dp = __p4d(0);
>   }
>   
> -static inline pte_t pgd_pte(pgd_t pgd)
> +static inline pte_t p4d_pte(p4d_t p4d)
>   {
> -	return __pte(pgd_val(pgd));
> +	return __pte(p4d_val(p4d));
>   }
>   
> -static inline pgd_t pte_pgd(pte_t pte)
> +static inline p4d_t pte_p4d(pte_t pte)
>   {
> -	return __pgd(pte_val(pte));
> +	return __p4d(pte_val(pte));
>   }
> -extern struct page *pgd_page(pgd_t pgd);
> +extern struct page *p4d_page(p4d_t p4d);
>   
>   #endif /* !__ASSEMBLY__ */
>   
> -#define pud_offset(pgdp, addr)	\
> -  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
> +#define pud_offset(p4dp, addr)	\
> +  (((pud_t *) p4d_page_vaddr(*(p4dp))) + \
>       (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
>   
>   #define pud_ERROR(e) \
> diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
> index 9a33b8bd842d..b360f262b9c6 100644
> --- a/arch/powerpc/include/asm/nohash/64/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
> @@ -175,11 +175,11 @@ static inline pud_t pte_pud(pte_t pte)
>   	return __pud(pte_val(pte));
>   }
>   #define pud_write(pud)		pte_write(pud_pte(pud))
> -#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
> +#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
>   
> -static inline void pgd_set(pgd_t *pgdp, unsigned long val)
> +static inline void p4d_set(p4d_t *p4dp, unsigned long val)
>   {
> -	*pgdp = __pgd(val);
> +	*p4dp = __p4d(val);
>   }
>   
>   /*
> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> index 8cc543ed114c..05205d7a7b4a 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -139,9 +139,9 @@ static inline bool pud_is_leaf(pud_t pud)
>   }
>   #endif
>   
> -#ifndef pgd_is_leaf
> -#define pgd_is_leaf pgd_is_leaf
> -static inline bool pgd_is_leaf(pgd_t pgd)
> +#ifndef p4d_is_leaf
> +#define p4d_is_leaf p4d_is_leaf
> +static inline bool p4d_is_leaf(p4d_t p4d)
>   {
>   	return false;
>   }
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 803940d79b73..beb694285100 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -499,13 +499,14 @@ void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
>   	unsigned long ig;
>   
>   	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
> +		p4d_t *p4d = p4d_offset(pgd, 0);
>   		pud_t *pud;
>   
> -		if (!pgd_present(*pgd))
> +		if (!p4d_present(*p4d))
>   			continue;
> -		pud = pud_offset(pgd, 0);
> +		pud = pud_offset(p4d, 0);
>   		kvmppc_unmap_free_pud(kvm, pud, lpid);
> -		pgd_clear(pgd);
> +		p4d_clear(p4d);
>   	}
>   }
>   
> @@ -566,6 +567,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>   		      unsigned long *rmapp, struct rmap_nested **n_rmap)
>   {
>   	pgd_t *pgd;
> +	p4d_t *p4d;
>   	pud_t *pud, *new_pud = NULL;
>   	pmd_t *pmd, *new_pmd = NULL;
>   	pte_t *ptep, *new_ptep = NULL;
> @@ -573,9 +575,11 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>   
>   	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
>   	pgd = pgtable + pgd_index(gpa);
> +	p4d = p4d_offset(pgd, gpa);
> +
>   	pud = NULL;
> -	if (pgd_present(*pgd))
> -		pud = pud_offset(pgd, gpa);
> +	if (p4d_present(*p4d))
> +		pud = pud_offset(p4d, gpa);
>   	else
>   		new_pud = pud_alloc_one(kvm->mm, gpa);
>   
> @@ -596,13 +600,13 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>   
>   	/* Now traverse again under the lock and change the tree */
>   	ret = -ENOMEM;
> -	if (pgd_none(*pgd)) {
> +	if (p4d_none(*p4d)) {
>   		if (!new_pud)
>   			goto out_unlock;
> -		pgd_populate(kvm->mm, pgd, new_pud);
> +		p4d_populate(kvm->mm, p4d, new_pud);
>   		new_pud = NULL;
>   	}
> -	pud = pud_offset(pgd, gpa);
> +	pud = pud_offset(p4d, gpa);
>   	if (pud_is_leaf(*pud)) {
>   		unsigned long hgpa = gpa & PUD_MASK;
>   
> @@ -1220,6 +1224,7 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
>   	pgd_t *pgt;
>   	struct kvm_nested_guest *nested;
>   	pgd_t pgd, *pgdp;
> +	p4d_t p4d, *p4dp;
>   	pud_t pud, *pudp;
>   	pmd_t pmd, *pmdp;
>   	pte_t *ptep;
> @@ -1292,13 +1297,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
>   		}
>   
>   		pgdp = pgt + pgd_index(gpa);
> -		pgd = READ_ONCE(*pgdp);
> -		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
> -			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
> +		p4dp = p4d_offset(pgdp, gpa);
> +		p4d = READ_ONCE(*p4dp);
> +		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
> +			gpa = (gpa & P4D_MASK) + P4D_SIZE;
>   			continue;
>   		}
>   
> -		pudp = pud_offset(&pgd, gpa);
> +		pudp = pud_offset(&p4d, gpa);
>   		pud = READ_ONCE(*pudp);
>   		if (!(pud_val(pud) & _PAGE_PRESENT)) {
>   			gpa = (gpa & PUD_MASK) + PUD_SIZE;
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index 3345f039a876..7a59f6863cec 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr)
>   	pte_t *ptep;
>   	pmd_t *pmdp;
>   	pud_t *pudp;
> +	p4d_t *p4dp;
>   	pgd_t *pgdp;
>   
>   	pgdp = pgd_offset_k(addr);
>   	if (unlikely(!pgdp))
>   		return -EINVAL;
>   
> -	pudp = pud_offset(pgdp, addr);
> +	p4dp = p4d_offset(pgdp, addr);
> +	if (unlikely(!p4dp))
> +		return -EINVAL;
> +
> +	pudp = pud_offset(p4dp, addr);
>   	if (unlikely(!pudp))
>   		return -EINVAL;
>   
> diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> index f888cbb109b9..edef17c97206 100644
> --- a/arch/powerpc/mm/book3s32/mmu.c
> +++ b/arch/powerpc/mm/book3s32/mmu.c
> @@ -312,7 +312,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea)
>   
>   	if (!Hash)
>   		return;
> -	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
> +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, ea), ea), ea), ea);
>   	if (!pmd_none(*pmd))
>   		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
>   }
> diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
> index 2fcd321040ff..175bc33b41b7 100644
> --- a/arch/powerpc/mm/book3s32/tlb.c
> +++ b/arch/powerpc/mm/book3s32/tlb.c
> @@ -87,7 +87,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
>   	if (start >= end)
>   		return;
>   	end = (end - 1) | ~PAGE_MASK;
> -	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
> +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, start), start), start), start);
>   	for (;;) {
>   		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
>   		if (pmd_end > end)
> @@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
>   		return;
>   	}
>   	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
> -	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
> +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr), vmaddr);
>   	if (!pmd_none(*pmd))
>   		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
>   }
> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> index 64733b9cb20a..9cd15937e88a 100644
> --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> @@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
>   int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
>   {
>   	pgd_t *pgdp;
> +	p4d_t *p4dp;
>   	pud_t *pudp;
>   	pmd_t *pmdp;
>   	pte_t *ptep;
> @@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
>   	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
>   	if (slab_is_available()) {
>   		pgdp = pgd_offset_k(ea);
> -		pudp = pud_alloc(&init_mm, pgdp, ea);
> +		p4dp = p4d_offset(pgdp, ea);
> +		pudp = pud_alloc(&init_mm, p4dp, ea);
>   		if (!pudp)
>   			return -ENOMEM;
>   		pmdp = pmd_alloc(&init_mm, pudp, ea);
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index dd1bea45325c..fc3d0b0460b0 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -64,17 +64,19 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>   {
>   	unsigned long pfn = pa >> PAGE_SHIFT;
>   	pgd_t *pgdp;
> +	p4d_t *p4dp;
>   	pud_t *pudp;
>   	pmd_t *pmdp;
>   	pte_t *ptep;
>   
>   	pgdp = pgd_offset_k(ea);
> -	if (pgd_none(*pgdp)) {
> +	p4dp = p4d_offset(pgdp, ea);
> +	if (p4d_none(*p4dp)) {
>   		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
>   						region_start, region_end);
> -		pgd_populate(&init_mm, pgdp, pudp);
> +		p4d_populate(&init_mm, p4dp, pudp);
>   	}
> -	pudp = pud_offset(pgdp, ea);
> +	pudp = pud_offset(p4dp, ea);
>   	if (map_page_size == PUD_SIZE) {
>   		ptep = (pte_t *)pudp;
>   		goto set_the_pte;
> @@ -114,6 +116,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
>   {
>   	unsigned long pfn = pa >> PAGE_SHIFT;
>   	pgd_t *pgdp;
> +	p4d_t *p4dp;
>   	pud_t *pudp;
>   	pmd_t *pmdp;
>   	pte_t *ptep;
> @@ -136,7 +139,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
>   	 * boot.
>   	 */
>   	pgdp = pgd_offset_k(ea);
> -	pudp = pud_alloc(&init_mm, pgdp, ea);
> +	p4dp = p4d_offset(pgdp, ea);
> +	pudp = pud_alloc(&init_mm, p4dp, ea);
>   	if (!pudp)
>   		return -ENOMEM;
>   	if (map_page_size == PUD_SIZE) {
> @@ -173,6 +177,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
>   {
>   	unsigned long idx;
>   	pgd_t *pgdp;
> +	p4d_t *p4dp;
>   	pud_t *pudp;
>   	pmd_t *pmdp;
>   	pte_t *ptep;
> @@ -185,7 +190,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
>   
>   	for (idx = start; idx < end; idx += PAGE_SIZE) {
>   		pgdp = pgd_offset_k(idx);
> -		pudp = pud_alloc(&init_mm, pgdp, idx);
> +		p4dp = p4d_offset(pgdp, idx);
> +		pudp = pud_alloc(&init_mm, p4dp, idx);
>   		if (!pudp)
>   			continue;
>   		if (pud_is_leaf(*pudp)) {
> @@ -847,6 +853,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
>   	unsigned long addr, next;
>   	pud_t *pud_base;
>   	pgd_t *pgd;
> +	p4d_t *p4d;
>   
>   	spin_lock(&init_mm.page_table_lock);
>   
> @@ -854,15 +861,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
>   		next = pgd_addr_end(addr, end);
>   
>   		pgd = pgd_offset_k(addr);
> -		if (!pgd_present(*pgd))
> +		p4d = p4d_offset(pgd, addr);
> +		if (!p4d_present(*p4d))
>   			continue;
>   
> -		if (pgd_is_leaf(*pgd)) {
> -			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
> +		if (p4d_is_leaf(*p4d)) {
> +			split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
>   			continue;
>   		}
>   
> -		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
> +		pud_base = (pud_t *)p4d_page_vaddr(*p4d);
>   		remove_pud_table(pud_base, addr, next);
>   	}
>   
> diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
> index 2ef24a53f4c9..25a0c044bd93 100644
> --- a/arch/powerpc/mm/book3s64/subpage_prot.c
> +++ b/arch/powerpc/mm/book3s64/subpage_prot.c
> @@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
>   			     int npages)
>   {
>   	pgd_t *pgd;
> +	p4d_t *p4d;
>   	pud_t *pud;
>   	pmd_t *pmd;
>   	pte_t *pte;
>   	spinlock_t *ptl;
>   
>   	pgd = pgd_offset(mm, addr);
> -	if (pgd_none(*pgd))
> +	p4d = p4d_offset(pgd, addr);
> +	if (p4d_none(*p4d))
>   		return;
> -	pud = pud_offset(pgd, addr);
> +	pud = pud_offset(p4d, addr);
>   	if (pud_none(*pud))
>   		return;
>   	pmd = pmd_offset(pud, addr);
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 33b3461d91e8..54f5994d4cbb 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -119,6 +119,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
>   pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
>   {
>   	pgd_t *pg;
> +	p4d_t *p4;
>   	pud_t *pu;
>   	pmd_t *pm;
>   	hugepd_t *hpdp = NULL;
> @@ -128,20 +129,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
>   
>   	addr &= ~(sz-1);
>   	pg = pgd_offset(mm, addr);
> +	p4 = p4d_offset(pg, addr);
>   
>   #ifdef CONFIG_PPC_BOOK3S_64
>   	if (pshift == PGDIR_SHIFT)
>   		/* 16GB huge page */
> -		return (pte_t *) pg;
> +		return (pte_t *) p4;
>   	else if (pshift > PUD_SHIFT) {
>   		/*
>   		 * We need to use hugepd table
>   		 */
>   		ptl = &mm->page_table_lock;
> -		hpdp = (hugepd_t *)pg;
> +		hpdp = (hugepd_t *)p4;
>   	} else {
>   		pdshift = PUD_SHIFT;
> -		pu = pud_alloc(mm, pg, addr);
> +		pu = pud_alloc(mm, p4, addr);
>   		if (!pu)
>   			return NULL;
>   		if (pshift == PUD_SHIFT)
> @@ -166,10 +168,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
>   #else
>   	if (pshift >= PGDIR_SHIFT) {
>   		ptl = &mm->page_table_lock;
> -		hpdp = (hugepd_t *)pg;
> +		hpdp = (hugepd_t *)p4;
>   	} else {
>   		pdshift = PUD_SHIFT;
> -		pu = pud_alloc(mm, pg, addr);
> +		pu = pud_alloc(mm, p4, addr);
>   		if (!pu)
>   			return NULL;
>   		if (pshift >= PUD_SHIFT) {
> @@ -390,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
>   	mm_dec_nr_pmds(tlb->mm);
>   }
>   
> -static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
> +static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
>   				   unsigned long addr, unsigned long end,
>   				   unsigned long floor, unsigned long ceiling)
>   {
> @@ -400,7 +402,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
>   
>   	start = addr;
>   	do {
> -		pud = pud_offset(pgd, addr);
> +		pud = pud_offset(p4d, addr);
>   		next = pud_addr_end(addr, end);
>   		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
>   			if (pud_none_or_clear_bad(pud))
> @@ -435,8 +437,8 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
>   	if (end - 1 > ceiling - 1)
>   		return;
>   
> -	pud = pud_offset(pgd, start);
> -	pgd_clear(pgd);
> +	pud = pud_offset(p4d, start);
> +	p4d_clear(p4d);
>   	pud_free_tlb(tlb, pud, start);
>   	mm_dec_nr_puds(tlb->mm);
>   }
> @@ -449,6 +451,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
>   			    unsigned long floor, unsigned long ceiling)
>   {
>   	pgd_t *pgd;
> +	p4d_t *p4d;
>   	unsigned long next;
>   
>   	/*
> @@ -471,10 +474,11 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
>   	do {
>   		next = pgd_addr_end(addr, end);
>   		pgd = pgd_offset(tlb->mm, addr);
> +		p4d = p4d_offset(pgd, addr);
>   		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
> -			if (pgd_none_or_clear_bad(pgd))
> +			if (p4d_none_or_clear_bad(p4d))
>   				continue;
> -			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
> +			hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
>   		} else {
>   			unsigned long more;
>   			/*
> @@ -487,7 +491,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
>   			if (more > next)
>   				next = more;
>   
> -			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
> +			free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
>   					  addr, next, floor, ceiling);
>   		}
>   	} while (addr = next, addr != end);
> diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
> index db5664dde5ff..88e2e16380b5 100644
> --- a/arch/powerpc/mm/kasan/kasan_init_32.c
> +++ b/arch/powerpc/mm/kasan/kasan_init_32.c
> @@ -36,7 +36,7 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned
>   	unsigned long k_cur, k_next;
>   	pte_t *new = NULL;
>   
> -	pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
> +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_start), k_start), k_start), k_start);
>   
>   	for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
>   		k_next = pgd_addr_end(k_cur, k_end);
> @@ -78,7 +78,7 @@ static int __init kasan_init_region(void *start, size_t size)
>   	block = memblock_alloc(k_end - k_start, PAGE_SIZE);
>   
>   	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
> -		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
> +		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
>   		void *va = block + k_cur - k_start;
>   		pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
>   
> @@ -102,7 +102,7 @@ static void __init kasan_remap_early_shadow_ro(void)
>   	kasan_populate_pte(kasan_early_shadow_pte, prot);
>   
>   	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
> -		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
> +		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
>   		pte_t *ptep = pte_offset_kernel(pmd, k_cur);
>   
>   		if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
> @@ -201,7 +201,7 @@ void __init kasan_early_init(void)
>   	unsigned long addr = KASAN_SHADOW_START;
>   	unsigned long end = KASAN_SHADOW_END;
>   	unsigned long next;
> -	pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
> +	pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(addr), addr), addr), addr);
>   
>   	BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
>   
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index ef7b1119b2e2..8262b384dcf3 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -69,8 +69,8 @@ EXPORT_SYMBOL(kmap_prot);
>   
>   static inline pte_t *virt_to_kpte(unsigned long vaddr)
>   {
> -	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
> -			vaddr), vaddr), vaddr);
> +	return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr),
> +			vaddr), vaddr), vaddr), vaddr);
>   }
>   #endif
>   
> diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c
> index f348104eb461..7aaf7155e350 100644
> --- a/arch/powerpc/mm/nohash/40x.c
> +++ b/arch/powerpc/mm/nohash/40x.c
> @@ -104,7 +104,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
>   		pmd_t *pmdp;
>   		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
>   
> -		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
> +		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
>   		*pmdp++ = __pmd(val);
>   		*pmdp++ = __pmd(val);
>   		*pmdp++ = __pmd(val);
> @@ -119,7 +119,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
>   		pmd_t *pmdp;
>   		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
>   
> -		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
> +		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
>   		*pmdp = __pmd(val);
>   
>   		v += LARGE_PAGE_SIZE_4M;
> diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
> index 4637fdd469cf..77884e24281d 100644
> --- a/arch/powerpc/mm/nohash/book3e_pgtable.c
> +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
> @@ -73,6 +73,7 @@ static void __init *early_alloc_pgtable(unsigned long size)
>   int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
>   {
>   	pgd_t *pgdp;
> +	p4d_t *p4dp;
>   	pud_t *pudp;
>   	pmd_t *pmdp;
>   	pte_t *ptep;
> @@ -80,7 +81,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
>   	BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
>   	if (slab_is_available()) {
>   		pgdp = pgd_offset_k(ea);
> -		pudp = pud_alloc(&init_mm, pgdp, ea);
> +		p4dp = p4d_offset(pgdp, ea);
> +		pudp = pud_alloc(&init_mm, p4dp, ea);
>   		if (!pudp)
>   			return -ENOMEM;
>   		pmdp = pmd_alloc(&init_mm, pudp, ea);
> @@ -91,13 +93,12 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
>   			return -ENOMEM;
>   	} else {
>   		pgdp = pgd_offset_k(ea);
> -#ifndef __PAGETABLE_PUD_FOLDED
> -		if (pgd_none(*pgdp)) {
> -			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
> -			pgd_populate(&init_mm, pgdp, pudp);
> +		p4dp = p4d_offset(pgdp, ea);
> +		if (p4d_none(*p4dp)) {
> +			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
> +			p4d_populate(&init_mm, p4dp, pmdp);
>   		}
> -#endif /* !__PAGETABLE_PUD_FOLDED */
> -		pudp = pud_offset(pgdp, ea);
> +		pudp = pud_offset(p4dp, ea);
>   		if (pud_none(*pudp)) {
>   			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
>   			pud_populate(&init_mm, pudp, pmdp);
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index e3759b69f81b..c2499271f6c1 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -265,6 +265,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>   void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
>   {
>   	pgd_t *pgd;
> +	p4d_t *p4d;
>   	pud_t *pud;
>   	pmd_t *pmd;
>   
> @@ -272,7 +273,9 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
>   		return;
>   	pgd = mm->pgd + pgd_index(addr);
>   	BUG_ON(pgd_none(*pgd));
> -	pud = pud_offset(pgd, addr);
> +	p4d = p4d_offset(pgd, addr);
> +	BUG_ON(p4d_none(*p4d));
> +	pud = pud_offset(p4d, addr);
>   	BUG_ON(pud_none(*pud));
>   	pmd = pmd_offset(pud, addr);
>   	/*
> @@ -312,12 +315,13 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys);
>   pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
>   			bool *is_thp, unsigned *hpage_shift)
>   {
> -	pgd_t pgd, *pgdp;
> +	pgd_t *pgdp;
> +	p4d_t p4d, *p4dp;
>   	pud_t pud, *pudp;
>   	pmd_t pmd, *pmdp;
>   	pte_t *ret_pte;
>   	hugepd_t *hpdp = NULL;
> -	unsigned pdshift = PGDIR_SHIFT;
> +	unsigned pdshift;
>   
>   	if (hpage_shift)
>   		*hpage_shift = 0;
> @@ -325,24 +329,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
>   	if (is_thp)
>   		*is_thp = false;
>   
> -	pgdp = pgdir + pgd_index(ea);
> -	pgd  = READ_ONCE(*pgdp);
>   	/*
>   	 * Always operate on the local stack value. This make sure the
>   	 * value don't get updated by a parallel THP split/collapse,
>   	 * page fault or a page unmap. The return pte_t * is still not
>   	 * stable. So should be checked there for above conditions.
> +	 * Top level is an exception because it is folded into p4d.
>   	 */
> -	if (pgd_none(pgd))
> +	pgdp = pgdir + pgd_index(ea);
> +	p4dp = p4d_offset(pgdp, ea);
> +	p4d  = READ_ONCE(*p4dp);
> +	pdshift = P4D_SHIFT;
> +
> +	if (p4d_none(p4d))
>   		return NULL;
>   
> -	if (pgd_is_leaf(pgd)) {
> -		ret_pte = (pte_t *)pgdp;
> +	if (p4d_is_leaf(p4d)) {
> +		ret_pte = (pte_t *)p4dp;
>   		goto out;
>   	}
>   
> -	if (is_hugepd(__hugepd(pgd_val(pgd)))) {
> -		hpdp = (hugepd_t *)&pgd;
> +	if (is_hugepd(__hugepd(p4d_val(p4d)))) {
> +		hpdp = (hugepd_t *)&p4d;
>   		goto out_huge;
>   	}
>   
> @@ -352,7 +360,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
>   	 * irq disabled
>   	 */
>   	pdshift = PUD_SHIFT;
> -	pudp = pud_offset(&pgd, ea);
> +	pudp = pud_offset(&p4d, ea);
>   	pud  = READ_ONCE(*pudp);
>   
>   	if (pud_none(pud))
> diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
> index 5fb90edd865e..5774d4bc94d0 100644
> --- a/arch/powerpc/mm/pgtable_32.c
> +++ b/arch/powerpc/mm/pgtable_32.c
> @@ -63,7 +63,7 @@ int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
>   	int err = -ENOMEM;
>   
>   	/* Use upper 10 bits of VA to index the first level map */
> -	pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
> +	pd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
>   	/* Use middle 10 bits of VA to index the second-level map */
>   	if (likely(slab_is_available()))
>   		pg = pte_alloc_kernel(pd, va);
> @@ -121,53 +121,24 @@ void __init mapin_ram(void)
>   	}
>   }
>   
> -/* Scan the real Linux page tables and return a PTE pointer for
> - * a virtual address in a context.
> - * Returns true (1) if PTE was found, zero otherwise.  The pointer to
> - * the PTE pointer is unmodified if PTE is not found.
> - */
> -static int
> -get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
> -{
> -        pgd_t	*pgd;
> -	pud_t	*pud;
> -        pmd_t	*pmd;
> -        pte_t	*pte;
> -        int     retval = 0;
> -
> -        pgd = pgd_offset(mm, addr & PAGE_MASK);
> -        if (pgd) {
> -		pud = pud_offset(pgd, addr & PAGE_MASK);
> -		if (pud && pud_present(*pud)) {
> -			pmd = pmd_offset(pud, addr & PAGE_MASK);
> -			if (pmd_present(*pmd)) {
> -				pte = pte_offset_map(pmd, addr & PAGE_MASK);
> -				if (pte) {
> -					retval = 1;
> -					*ptep = pte;
> -					if (pmdp)
> -						*pmdp = pmd;
> -					/* XXX caller needs to do pte_unmap, yuck */
> -				}
> -			}
> -		}
> -        }
> -        return(retval);
> -}
> -
>   static int __change_page_attr_noflush(struct page *page, pgprot_t prot)
>   {
>   	pte_t *kpte;
>   	pmd_t *kpmd;
> -	unsigned long address;
> +	unsigned long address, va;
>   
>   	BUG_ON(PageHighMem(page));
>   	address = (unsigned long)page_address(page);
> +	va = address & PAGE_MASK;
>   
>   	if (v_block_mapped(address))
>   		return 0;
> -	if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
> +
> +	kpmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
> +	if (!pmd_present(*kpmd))
>   		return -EINVAL;
> +
> +	kpte = pte_offset_map(kpmd, va);
>   	__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
>   	pte_unmap(kpte);
>   
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index e78832dce7bb..1f86a88fd4bb 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -101,13 +101,13 @@ EXPORT_SYMBOL(__pte_frag_size_shift);
>   
>   #ifndef __PAGETABLE_PUD_FOLDED
>   /* 4 level page table */
> -struct page *pgd_page(pgd_t pgd)
> +struct page *p4d_page(p4d_t p4d)
>   {
> -	if (pgd_is_leaf(pgd)) {
> -		VM_WARN_ON(!pgd_huge(pgd));
> -		return pte_page(pgd_pte(pgd));
> +	if (p4d_is_leaf(p4d)) {
> +		VM_WARN_ON(!p4d_huge(p4d));
> +		return pte_page(p4d_pte(p4d));
>   	}
> -	return virt_to_page(pgd_page_vaddr(pgd));
> +	return virt_to_page(p4d_page_vaddr(p4d));
>   }
>   #endif
>   
> diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
> index a07278027c6f..ac360ad865a8 100644
> --- a/arch/powerpc/mm/ptdump/hashpagetable.c
> +++ b/arch/powerpc/mm/ptdump/hashpagetable.c
> @@ -417,9 +417,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
>   	}
>   }
>   
> -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
>   {
> -	pud_t *pud = pud_offset(pgd, 0);
> +	pud_t *pud = pud_offset(p4d, 0);
>   	unsigned long addr;
>   	unsigned int i;
>   
> @@ -431,6 +431,20 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
>   	}
>   }
>   
> +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +	p4d_t *p4d = p4d_offset(pgd, 0);
> +	unsigned long addr;
> +	unsigned int i;
> +
> +	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
> +		addr = start + i * P4D_SIZE;
> +		if (!p4d_none(*p4d))
> +			/* p4d exists */
> +			walk_pud(st, p4d, addr);
> +	}
> +}
> +
>   static void walk_pagetables(struct pg_state *st)
>   {
>   	pgd_t *pgd = pgd_offset_k(0UL);
> @@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st)
>   		addr = KERN_VIRT_START + i * PGDIR_SIZE;
>   		if (!pgd_none(*pgd))
>   			/* pgd exists */
> -			walk_pud(st, pgd, addr);
> +			walk_p4d(st, pgd, addr);
>   	}
>   }
>   
> diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
> index 206156255247..9d6256b61df3 100644
> --- a/arch/powerpc/mm/ptdump/ptdump.c
> +++ b/arch/powerpc/mm/ptdump/ptdump.c
> @@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
>   	}
>   }
>   
> -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
>   {
> -	pud_t *pud = pud_offset(pgd, 0);
> +	pud_t *pud = pud_offset(p4d, 0);
>   	unsigned long addr;
>   	unsigned int i;
>   
> @@ -304,11 +304,13 @@ static void walk_pagetables(struct pg_state *st)
>   	 * the hash pagetable.
>   	 */
>   	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
> -		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
> -			/* pgd exists */
> -			walk_pud(st, pgd, addr);
> +		p4d_t *p4d = p4d_offset(pgd, 0);
> +
> +		if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d))
> +			/* p4d exists */
> +			walk_pud(st, p4d, addr);
>   		else
> -			note_page(st, addr, 1, pgd_val(*pgd));
> +			note_page(st, addr, 1, p4d_val(*p4d));
>   	}
>   }
>   
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 0ec9640335bb..3e29128c58cc 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -3130,6 +3130,7 @@ static void show_pte(unsigned long addr)
>   	struct task_struct *tsk = NULL;
>   	struct mm_struct *mm;
>   	pgd_t *pgdp, *pgdir;
> +	p4d_t *p4dp;
>   	pud_t *pudp;
>   	pmd_t *pmdp;
>   	pte_t *ptep;
> @@ -3161,20 +3162,21 @@ static void show_pte(unsigned long addr)
>   		pgdir = pgd_offset(mm, 0);
>   	}
>   
> -	if (pgd_none(*pgdp)) {
> -		printf("no linux page table for address\n");
> +	p4dp = p4d_offset(pgdp, addr);
> +
> +	if (p4d_none(*p4dp)) {
> +		printf("No valid P4D\n");
>   		return;
>   	}
>   
> -	printf("pgd  @ 0x%px\n", pgdir);
> -
> -	if (pgd_is_leaf(*pgdp)) {
> -		format_pte(pgdp, pgd_val(*pgdp));
> +	if (p4d_is_leaf(*p4dp)) {
> +		format_pte(p4dp, p4d_val(*p4dp));
>   		return;
>   	}
> -	printf("pgdp @ 0x%px = 0x%016lx\n", pgdp, pgd_val(*pgdp));
>   
> -	pudp = pud_offset(pgdp, addr);
> +	printf("p4dp @ 0x%px = 0x%016lx\n", p4dp, p4d_val(*p4dp));
> +
> +	pudp = pud_offset(p4dp, addr);
>   
>   	if (pud_none(*pudp)) {
>   		printf("No valid PUD\n");
>
Mike Rapoport Feb. 26, 2020, 10:56 a.m. UTC | #7
On Wed, Feb 26, 2020 at 10:46:13AM +0100, Christophe Leroy wrote:
> 
> 
> On 26/02/2020 at 10:13, Mike Rapoport wrote:
> > On Tue, Feb 18, 2020 at 12:54:40PM +0200, Mike Rapoport wrote:
> > > On Sun, Feb 16, 2020 at 11:41:07AM +0100, Christophe Leroy wrote:
> > > > 
> > > > 
> > > > On 16/02/2020 at 09:18, Mike Rapoport wrote:
> > > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > > > 
> > > > > Implement primitives necessary for the 4th level folding, add walks of p4d
> > > > > level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
> > > > 
> > > > I don't think it is worth adding all this additionnals walks of p4d, this
> > > > patch could be limited to changes like:
> > > > 
> > > > -		pud = pud_offset(pgd, gpa);
> > > > +		pud = pud_offset(p4d_offset(pgd, gpa), gpa);
> > > > 
> > > > The additionnal walks should be added through another patch the day powerpc
> > > > need them.
> > > 
> > > Ok, I'll update the patch to reduce walking the p4d.
> > 
> > Here's what I have with more direct accesses from pgd to pud.
> 
> I went quickly through. This looks promising.
> 
> Do we need the walk_p4d() in arch/powerpc/mm/ptdump/hashpagetable.c?
> Can't we just do
> 
> @@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st)
>  		addr = KERN_VIRT_START + i * PGDIR_SIZE;
>  		if (!pgd_none(*pgd))
>  			/* pgd exists */
> -			walk_pud(st, pgd, addr);
> +			walk_pud(st, p4d_offset(pgd, addr), addr);

We can do

	addr = KERN_VIRT_START + i * PGDIR_SIZE;
	p4d = p4d_offset(pgd, addr);
	if (!p4d_none(*p4d))
		walk_pud()

But I don't think this is really essential. Again, we are trading off code
consistency vs line count. I don't think line count is that important.
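
To make the trade-off concrete, here is a minimal sketch of the two shapes
being discussed for the hashpagetable.c dumper. walk_pud() and struct
pg_state are the existing ptdump helpers; walk_pagetables_folded() is only
an illustrative name, not code from either patch:

	/* Variant 1: keep a dedicated walk_p4d() level for consistency. */
	static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
	{
		p4d_t *p4d = p4d_offset(pgd, 0);
		unsigned long addr;
		unsigned int i;

		/* With the folded level, PTRS_PER_P4D is 1, so this runs once. */
		for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
			addr = start + i * P4D_SIZE;
			if (!p4d_none(*p4d))
				walk_pud(st, p4d, addr);
		}
	}

	/* Variant 2: fold the level away at the call site. */
	static void walk_pagetables_folded(struct pg_state *st)
	{
		pgd_t *pgd = pgd_offset_k(0UL);
		unsigned long addr;
		unsigned int i;

		for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
			addr = KERN_VIRT_START + i * PGDIR_SIZE;
			if (!pgd_none(*pgd))
				walk_pud(st, p4d_offset(pgd, addr), addr);
		}
	}

Either way walk_pud() takes a p4d_t pointer, so its prototype change is the
same; the only difference is whether the extra loop is spelled out.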

>  	}
>  }
> 
> 
> 
> Also, I think the removal of get_pteptr() should be a patch by itself.
> 
> You could include my patches in your series. See
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=152102
> 
> Christophe
> 
> 
> 
> > 
> >  From 6c59a86ce8394fb6100e9b6ced2e346981fb0ce9 Mon Sep 17 00:00:00 2001
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > Date: Sun, 24 Nov 2019 15:38:00 +0200
> > Subject: [PATCH v3] powerpc: add support for folded p4d page tables
> > 
> > Implement primitives necessary for the 4th level folding, add walks of p4d
> > level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
> > ---
> > v3:
> > * reduce the amount of added p4d walks
> > * kill pgtable_32::get_pteptr and traverse the page table directly in
> >    pgtable_32::__change_page_attr_noflush
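
(For reference, the second item boils down to open-coding the walk with the
chained per-level offset helpers; the lines below are just an excerpt for
context, the full hunk is in the pgtable_32.c diff further down:

	/* va is the page-aligned kernel virtual address of the target page. */
	kpmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
	if (!pmd_present(*kpmd))
		return -EINVAL;

	kpte = pte_offset_map(kpmd, va);

On a 32-bit build the p4d and pud levels are folded, so the chain collapses
to the same single top-level lookup get_pteptr() used to do.)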
> > 
> > 
> >   arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 -
> >   arch/powerpc/include/asm/book3s/64/hash.h     |  4 +-
> >   arch/powerpc/include/asm/book3s/64/pgalloc.h  |  4 +-
> >   arch/powerpc/include/asm/book3s/64/pgtable.h  | 60 ++++++++++---------
> >   arch/powerpc/include/asm/book3s/64/radix.h    |  6 +-
> >   arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 -
> >   arch/powerpc/include/asm/nohash/64/pgalloc.h  |  2 +-
> >   .../include/asm/nohash/64/pgtable-4k.h        | 32 +++++-----
> >   arch/powerpc/include/asm/nohash/64/pgtable.h  |  6 +-
> >   arch/powerpc/include/asm/pgtable.h            |  6 +-
> >   arch/powerpc/kvm/book3s_64_mmu_radix.c        | 30 ++++++----
> >   arch/powerpc/lib/code-patching.c              |  7 ++-
> >   arch/powerpc/mm/book3s32/mmu.c                |  2 +-
> >   arch/powerpc/mm/book3s32/tlb.c                |  4 +-
> >   arch/powerpc/mm/book3s64/hash_pgtable.c       |  4 +-
> >   arch/powerpc/mm/book3s64/radix_pgtable.c      | 26 +++++---
> >   arch/powerpc/mm/book3s64/subpage_prot.c       |  6 +-
> >   arch/powerpc/mm/hugetlbpage.c                 | 28 +++++----
> >   arch/powerpc/mm/kasan/kasan_init_32.c         |  8 +--
> >   arch/powerpc/mm/mem.c                         |  4 +-
> >   arch/powerpc/mm/nohash/40x.c                  |  4 +-
> >   arch/powerpc/mm/nohash/book3e_pgtable.c       | 15 ++---
> >   arch/powerpc/mm/pgtable.c                     | 30 ++++++----
> >   arch/powerpc/mm/pgtable_32.c                  | 45 +++-----------
> >   arch/powerpc/mm/pgtable_64.c                  | 10 ++--
> >   arch/powerpc/mm/ptdump/hashpagetable.c        | 20 ++++++-
> >   arch/powerpc/mm/ptdump/ptdump.c               | 14 +++--
> >   arch/powerpc/xmon/xmon.c                      | 18 +++---
> >   28 files changed, 213 insertions(+), 184 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
> > index 5b39c11e884a..39ec11371be0 100644
> > --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> > +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> > @@ -2,7 +2,6 @@
> >   #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
> >   #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
> > -#define __ARCH_USE_5LEVEL_HACK
> >   #include <asm-generic/pgtable-nopmd.h>
> >   #include <asm/book3s/32/hash.h>
> > diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> > index 2781ebf6add4..876d1528c2cf 100644
> > --- a/arch/powerpc/include/asm/book3s/64/hash.h
> > +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> > @@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea)
> >   #define	hash__pmd_bad(pmd)		(pmd_val(pmd) & H_PMD_BAD_BITS)
> >   #define	hash__pud_bad(pud)		(pud_val(pud) & H_PUD_BAD_BITS)
> > -static inline int hash__pgd_bad(pgd_t pgd)
> > +static inline int hash__p4d_bad(p4d_t p4d)
> >   {
> > -	return (pgd_val(pgd) == 0);
> > +	return (p4d_val(p4d) == 0);
> >   }
> >   #ifdef CONFIG_STRICT_KERNEL_RWX
> >   extern void hash__mark_rodata_ro(void);
> > diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> > index a41e91bd0580..69c5b051734f 100644
> > --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> > +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> > @@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
> >   	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
> >   }
> > -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
> > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
> >   {
> > -	*pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
> > +	*pgd =  __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
> >   }
> >   static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> > diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> > index 201a69e6a355..fa60e8594b9f 100644
> > --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> > +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> > @@ -2,7 +2,7 @@
> >   #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
> >   #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
> > -#include <asm-generic/5level-fixup.h>
> > +#include <asm-generic/pgtable-nop4d.h>
> >   #ifndef __ASSEMBLY__
> >   #include <linux/mmdebug.h>
> > @@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift;
> >   /* Bits to mask out from a PUD to get to the PMD page */
> >   #define PUD_MASKED_BITS		0xc0000000000000ffUL
> >   /* Bits to mask out from a PGD to get to the PUD page */
> > -#define PGD_MASKED_BITS		0xc0000000000000ffUL
> > +#define P4D_MASKED_BITS		0xc0000000000000ffUL
> >   /*
> >    * Used as an indicator for rcu callback functions
> > @@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
> >   	return pte_access_permitted(pud_pte(pud), write);
> >   }
> > -#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
> > +#define __p4d_raw(x)	((p4d_t) { __pgd_raw(x) })
> > +static inline __be64 p4d_raw(p4d_t x)
> > +{
> > +	return pgd_raw(x.pgd);
> > +}
> > +
> > +#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
> > -static inline void pgd_clear(pgd_t *pgdp)
> > +static inline void p4d_clear(p4d_t *p4dp)
> >   {
> > -	*pgdp = __pgd(0);
> > +	*p4dp = __p4d(0);
> >   }
> > -static inline int pgd_none(pgd_t pgd)
> > +static inline int p4d_none(p4d_t p4d)
> >   {
> > -	return !pgd_raw(pgd);
> > +	return !p4d_raw(p4d);
> >   }
> > -static inline int pgd_present(pgd_t pgd)
> > +static inline int p4d_present(p4d_t p4d)
> >   {
> > -	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
> > +	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT));
> >   }
> > -static inline pte_t pgd_pte(pgd_t pgd)
> > +static inline pte_t p4d_pte(p4d_t p4d)
> >   {
> > -	return __pte_raw(pgd_raw(pgd));
> > +	return __pte_raw(p4d_raw(p4d));
> >   }
> > -static inline pgd_t pte_pgd(pte_t pte)
> > +static inline p4d_t pte_p4d(pte_t pte)
> >   {
> > -	return __pgd_raw(pte_raw(pte));
> > +	return __p4d_raw(pte_raw(pte));
> >   }
> > -static inline int pgd_bad(pgd_t pgd)
> > +static inline int p4d_bad(p4d_t p4d)
> >   {
> >   	if (radix_enabled())
> > -		return radix__pgd_bad(pgd);
> > -	return hash__pgd_bad(pgd);
> > +		return radix__p4d_bad(p4d);
> > +	return hash__p4d_bad(p4d);
> >   }
> > -#define pgd_access_permitted pgd_access_permitted
> > -static inline bool pgd_access_permitted(pgd_t pgd, bool write)
> > +#define p4d_access_permitted p4d_access_permitted
> > +static inline bool p4d_access_permitted(p4d_t p4d, bool write)
> >   {
> > -	return pte_access_permitted(pgd_pte(pgd), write);
> > +	return pte_access_permitted(p4d_pte(p4d), write);
> >   }
> > -extern struct page *pgd_page(pgd_t pgd);
> > +extern struct page *p4d_page(p4d_t p4d);
> >   /* Pointers in the page table tree are physical addresses */
> >   #define __pgtable_ptr_val(ptr)	__pa(ptr)
> >   #define pmd_page_vaddr(pmd)	__va(pmd_val(pmd) & ~PMD_MASKED_BITS)
> >   #define pud_page_vaddr(pud)	__va(pud_val(pud) & ~PUD_MASKED_BITS)
> > -#define pgd_page_vaddr(pgd)	__va(pgd_val(pgd) & ~PGD_MASKED_BITS)
> > +#define p4d_page_vaddr(p4d)	__va(p4d_val(p4d) & ~P4D_MASKED_BITS)
> >   #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
> >   #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
> > @@ -1010,8 +1016,8 @@ extern struct page *pgd_page(pgd_t pgd);
> >   #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
> > -#define pud_offset(pgdp, addr)	\
> > -	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
> > +#define pud_offset(p4dp, addr)	\
> > +	(((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr))
> >   #define pmd_offset(pudp,addr) \
> >   	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
> >   #define pte_offset_kernel(dir,addr) \
> > @@ -1368,11 +1374,11 @@ static inline bool pud_is_leaf(pud_t pud)
> >   	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
> >   }
> > -#define pgd_is_leaf pgd_is_leaf
> > -#define pgd_leaf pgd_is_leaf
> > -static inline bool pgd_is_leaf(pgd_t pgd)
> > +#define p4d_is_leaf p4d_is_leaf
> > +#define p4d_leaf p4d_is_leaf
> > +static inline bool p4d_is_leaf(p4d_t p4d)
> >   {
> > -	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
> > +	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE));
> >   }
> >   #endif /* __ASSEMBLY__ */
> > diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> > index d97db3ad9aae..9bca2ac64220 100644
> > --- a/arch/powerpc/include/asm/book3s/64/radix.h
> > +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> > @@ -30,7 +30,7 @@
> >   /* Don't have anything in the reserved bits and leaf bits */
> >   #define RADIX_PMD_BAD_BITS		0x60000000000000e0UL
> >   #define RADIX_PUD_BAD_BITS		0x60000000000000e0UL
> > -#define RADIX_PGD_BAD_BITS		0x60000000000000e0UL
> > +#define RADIX_P4D_BAD_BITS		0x60000000000000e0UL
> >   #define RADIX_PMD_SHIFT		(PAGE_SHIFT + RADIX_PTE_INDEX_SIZE)
> >   #define RADIX_PUD_SHIFT		(RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE)
> > @@ -227,9 +227,9 @@ static inline int radix__pud_bad(pud_t pud)
> >   }
> > -static inline int radix__pgd_bad(pgd_t pgd)
> > +static inline int radix__p4d_bad(p4d_t p4d)
> >   {
> > -	return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS);
> > +	return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS);
> >   }
> >   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
> > index 60c4d829152e..d4c2c4259fa3 100644
> > --- a/arch/powerpc/include/asm/nohash/32/pgtable.h
> > +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
> > @@ -2,7 +2,6 @@
> >   #ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H
> >   #define _ASM_POWERPC_NOHASH_32_PGTABLE_H
> > -#define __ARCH_USE_5LEVEL_HACK
> >   #include <asm-generic/pgtable-nopmd.h>
> >   #ifndef __ASSEMBLY__
> > diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
> > index b9534a793293..668aee6017e7 100644
> > --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h
> > +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
> > @@ -15,7 +15,7 @@ struct vmemmap_backing {
> >   };
> >   extern struct vmemmap_backing *vmemmap_list;
> > -#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)PUD)
> > +#define p4d_populate(MM, P4D, PUD)	p4d_set(P4D, (unsigned long)PUD)
> >   static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> >   {
> > diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
> > index c40ec32b8194..81b1c54e3cf1 100644
> > --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
> > +++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
> > @@ -2,7 +2,7 @@
> >   #ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
> >   #define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
> > -#include <asm-generic/5level-fixup.h>
> > +#include <asm-generic/pgtable-nop4d.h>
> >   /*
> >    * Entries per page directory level.  The PTE level must use a 64b record
> > @@ -45,41 +45,41 @@
> >   #define PMD_MASKED_BITS		0
> >   /* Bits to mask out from a PUD to get to the PMD page */
> >   #define PUD_MASKED_BITS		0
> > -/* Bits to mask out from a PGD to get to the PUD page */
> > -#define PGD_MASKED_BITS		0
> > +/* Bits to mask out from a P4D to get to the PUD page */
> > +#define P4D_MASKED_BITS		0
> >   /*
> >    * 4-level page tables related bits
> >    */
> > -#define pgd_none(pgd)		(!pgd_val(pgd))
> > -#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
> > -#define pgd_present(pgd)	(pgd_val(pgd) != 0)
> > -#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
> > +#define p4d_none(p4d)		(!p4d_val(p4d))
> > +#define p4d_bad(p4d)		(p4d_val(p4d) == 0)
> > +#define p4d_present(p4d)	(p4d_val(p4d) != 0)
> > +#define p4d_page_vaddr(p4d)	(p4d_val(p4d) & ~P4D_MASKED_BITS)
> >   #ifndef __ASSEMBLY__
> > -static inline void pgd_clear(pgd_t *pgdp)
> > +static inline void p4d_clear(p4d_t *p4dp)
> >   {
> > -	*pgdp = __pgd(0);
> > +	*p4dp = __p4d(0);
> >   }
> > -static inline pte_t pgd_pte(pgd_t pgd)
> > +static inline pte_t p4d_pte(p4d_t p4d)
> >   {
> > -	return __pte(pgd_val(pgd));
> > +	return __pte(p4d_val(p4d));
> >   }
> > -static inline pgd_t pte_pgd(pte_t pte)
> > +static inline p4d_t pte_p4d(pte_t pte)
> >   {
> > -	return __pgd(pte_val(pte));
> > +	return __p4d(pte_val(pte));
> >   }
> > -extern struct page *pgd_page(pgd_t pgd);
> > +extern struct page *p4d_page(p4d_t p4d);
> >   #endif /* !__ASSEMBLY__ */
> > -#define pud_offset(pgdp, addr)	\
> > -  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
> > +#define pud_offset(p4dp, addr)	\
> > +  (((pud_t *) p4d_page_vaddr(*(p4dp))) + \
> >       (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
> >   #define pud_ERROR(e) \
> > diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
> > index 9a33b8bd842d..b360f262b9c6 100644
> > --- a/arch/powerpc/include/asm/nohash/64/pgtable.h
> > +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
> > @@ -175,11 +175,11 @@ static inline pud_t pte_pud(pte_t pte)
> >   	return __pud(pte_val(pte));
> >   }
> >   #define pud_write(pud)		pte_write(pud_pte(pud))
> > -#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
> > +#define p4d_write(pgd)		pte_write(p4d_pte(p4d))
> > -static inline void pgd_set(pgd_t *pgdp, unsigned long val)
> > +static inline void p4d_set(p4d_t *p4dp, unsigned long val)
> >   {
> > -	*pgdp = __pgd(val);
> > +	*p4dp = __p4d(val);
> >   }
> >   /*
> > diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> > index 8cc543ed114c..05205d7a7b4a 100644
> > --- a/arch/powerpc/include/asm/pgtable.h
> > +++ b/arch/powerpc/include/asm/pgtable.h
> > @@ -139,9 +139,9 @@ static inline bool pud_is_leaf(pud_t pud)
> >   }
> >   #endif
> > -#ifndef pgd_is_leaf
> > -#define pgd_is_leaf pgd_is_leaf
> > -static inline bool pgd_is_leaf(pgd_t pgd)
> > +#ifndef p4d_is_leaf
> > +#define p4d_is_leaf p4d_is_leaf
> > +static inline bool p4d_is_leaf(p4d_t p4d)
> >   {
> >   	return false;
> >   }
> > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > index 803940d79b73..beb694285100 100644
> > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > @@ -499,13 +499,14 @@ void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
> >   	unsigned long ig;
> >   	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
> > +		p4d_t *p4d = p4d_offset(pgd, 0);
> >   		pud_t *pud;
> > -		if (!pgd_present(*pgd))
> > +		if (!p4d_present(*p4d))
> >   			continue;
> > -		pud = pud_offset(pgd, 0);
> > +		pud = pud_offset(p4d, 0);
> >   		kvmppc_unmap_free_pud(kvm, pud, lpid);
> > -		pgd_clear(pgd);
> > +		p4d_clear(p4d);
> >   	}
> >   }
> > @@ -566,6 +567,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> >   		      unsigned long *rmapp, struct rmap_nested **n_rmap)
> >   {
> >   	pgd_t *pgd;
> > +	p4d_t *p4d;
> >   	pud_t *pud, *new_pud = NULL;
> >   	pmd_t *pmd, *new_pmd = NULL;
> >   	pte_t *ptep, *new_ptep = NULL;
> > @@ -573,9 +575,11 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> >   	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
> >   	pgd = pgtable + pgd_index(gpa);
> > +	p4d = p4d_offset(pgd, gpa);
> > +
> >   	pud = NULL;
> > -	if (pgd_present(*pgd))
> > -		pud = pud_offset(pgd, gpa);
> > +	if (p4d_present(*p4d))
> > +		pud = pud_offset(p4d, gpa);
> >   	else
> >   		new_pud = pud_alloc_one(kvm->mm, gpa);
> > @@ -596,13 +600,13 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> >   	/* Now traverse again under the lock and change the tree */
> >   	ret = -ENOMEM;
> > -	if (pgd_none(*pgd)) {
> > +	if (p4d_none(*p4d)) {
> >   		if (!new_pud)
> >   			goto out_unlock;
> > -		pgd_populate(kvm->mm, pgd, new_pud);
> > +		p4d_populate(kvm->mm, p4d, new_pud);
> >   		new_pud = NULL;
> >   	}
> > -	pud = pud_offset(pgd, gpa);
> > +	pud = pud_offset(p4d, gpa);
> >   	if (pud_is_leaf(*pud)) {
> >   		unsigned long hgpa = gpa & PUD_MASK;
> > @@ -1220,6 +1224,7 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
> >   	pgd_t *pgt;
> >   	struct kvm_nested_guest *nested;
> >   	pgd_t pgd, *pgdp;
> > +	p4d_t p4d, *p4dp;
> >   	pud_t pud, *pudp;
> >   	pmd_t pmd, *pmdp;
> >   	pte_t *ptep;
> > @@ -1292,13 +1297,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
> >   		}
> >   		pgdp = pgt + pgd_index(gpa);
> > -		pgd = READ_ONCE(*pgdp);
> > -		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
> > -			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
> > +		p4dp = p4d_offset(pgdp, gpa);
> > +		p4d = READ_ONCE(*p4dp);
> > +		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
> > +			gpa = (gpa & P4D_MASK) + P4D_SIZE;
> >   			continue;
> >   		}
> > -		pudp = pud_offset(&pgd, gpa);
> > +		pudp = pud_offset(&p4d, gpa);
> >   		pud = READ_ONCE(*pudp);
> >   		if (!(pud_val(pud) & _PAGE_PRESENT)) {
> >   			gpa = (gpa & PUD_MASK) + PUD_SIZE;
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 3345f039a876..7a59f6863cec 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr)
> >   	pte_t *ptep;
> >   	pmd_t *pmdp;
> >   	pud_t *pudp;
> > +	p4d_t *p4dp;
> >   	pgd_t *pgdp;
> >   	pgdp = pgd_offset_k(addr);
> >   	if (unlikely(!pgdp))
> >   		return -EINVAL;
> > -	pudp = pud_offset(pgdp, addr);
> > +	p4dp = p4d_offset(pgdp, addr);
> > +	if (unlikely(!p4dp))
> > +		return -EINVAL;
> > +
> > +	pudp = pud_offset(p4dp, addr);
> >   	if (unlikely(!pudp))
> >   		return -EINVAL;
> > diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> > index f888cbb109b9..edef17c97206 100644
> > --- a/arch/powerpc/mm/book3s32/mmu.c
> > +++ b/arch/powerpc/mm/book3s32/mmu.c
> > @@ -312,7 +312,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea)
> >   	if (!Hash)
> >   		return;
> > -	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, ea), ea), ea), ea);
> >   	if (!pmd_none(*pmd))
> >   		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
> >   }
> > diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
> > index 2fcd321040ff..175bc33b41b7 100644
> > --- a/arch/powerpc/mm/book3s32/tlb.c
> > +++ b/arch/powerpc/mm/book3s32/tlb.c
> > @@ -87,7 +87,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
> >   	if (start >= end)
> >   		return;
> >   	end = (end - 1) | ~PAGE_MASK;
> > -	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, start), start), start), start);
> >   	for (;;) {
> >   		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
> >   		if (pmd_end > end)
> > @@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
> >   		return;
> >   	}
> >   	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
> > -	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr), vmaddr);
> >   	if (!pmd_none(*pmd))
> >   		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
> >   }
> > diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> > index 64733b9cb20a..9cd15937e88a 100644
> > --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> > +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> > @@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
> >   int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
> >   {
> >   	pgd_t *pgdp;
> > +	p4d_t *p4dp;
> >   	pud_t *pudp;
> >   	pmd_t *pmdp;
> >   	pte_t *ptep;
> > @@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
> >   	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
> >   	if (slab_is_available()) {
> >   		pgdp = pgd_offset_k(ea);
> > -		pudp = pud_alloc(&init_mm, pgdp, ea);
> > +		p4dp = p4d_offset(pgdp, ea);
> > +		pudp = pud_alloc(&init_mm, p4dp, ea);
> >   		if (!pudp)
> >   			return -ENOMEM;
> >   		pmdp = pmd_alloc(&init_mm, pudp, ea);
> > diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> > index dd1bea45325c..fc3d0b0460b0 100644
> > --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> > +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> > @@ -64,17 +64,19 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
> >   {
> >   	unsigned long pfn = pa >> PAGE_SHIFT;
> >   	pgd_t *pgdp;
> > +	p4d_t *p4dp;
> >   	pud_t *pudp;
> >   	pmd_t *pmdp;
> >   	pte_t *ptep;
> >   	pgdp = pgd_offset_k(ea);
> > -	if (pgd_none(*pgdp)) {
> > +	p4dp = p4d_offset(pgdp, ea);
> > +	if (p4d_none(*p4dp)) {
> >   		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> >   						region_start, region_end);
> > -		pgd_populate(&init_mm, pgdp, pudp);
> > +		p4d_populate(&init_mm, p4dp, pudp);
> >   	}
> > -	pudp = pud_offset(pgdp, ea);
> > +	pudp = pud_offset(p4dp, ea);
> >   	if (map_page_size == PUD_SIZE) {
> >   		ptep = (pte_t *)pudp;
> >   		goto set_the_pte;
> > @@ -114,6 +116,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
> >   {
> >   	unsigned long pfn = pa >> PAGE_SHIFT;
> >   	pgd_t *pgdp;
> > +	p4d_t *p4dp;
> >   	pud_t *pudp;
> >   	pmd_t *pmdp;
> >   	pte_t *ptep;
> > @@ -136,7 +139,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
> >   	 * boot.
> >   	 */
> >   	pgdp = pgd_offset_k(ea);
> > -	pudp = pud_alloc(&init_mm, pgdp, ea);
> > +	p4dp = p4d_offset(pgdp, ea);
> > +	pudp = pud_alloc(&init_mm, p4dp, ea);
> >   	if (!pudp)
> >   		return -ENOMEM;
> >   	if (map_page_size == PUD_SIZE) {
> > @@ -173,6 +177,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
> >   {
> >   	unsigned long idx;
> >   	pgd_t *pgdp;
> > +	p4d_t *p4dp;
> >   	pud_t *pudp;
> >   	pmd_t *pmdp;
> >   	pte_t *ptep;
> > @@ -185,7 +190,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
> >   	for (idx = start; idx < end; idx += PAGE_SIZE) {
> >   		pgdp = pgd_offset_k(idx);
> > -		pudp = pud_alloc(&init_mm, pgdp, idx);
> > +		p4dp = p4d_offset(pgdp, idx);
> > +		pudp = pud_alloc(&init_mm, p4dp, idx);
> >   		if (!pudp)
> >   			continue;
> >   		if (pud_is_leaf(*pudp)) {
> > @@ -847,6 +853,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
> >   	unsigned long addr, next;
> >   	pud_t *pud_base;
> >   	pgd_t *pgd;
> > +	p4d_t *p4d;
> >   	spin_lock(&init_mm.page_table_lock);
> > @@ -854,15 +861,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
> >   		next = pgd_addr_end(addr, end);
> >   		pgd = pgd_offset_k(addr);
> > -		if (!pgd_present(*pgd))
> > +		p4d = p4d_offset(pgd, addr);
> > +		if (!p4d_present(*p4d))
> >   			continue;
> > -		if (pgd_is_leaf(*pgd)) {
> > -			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
> > +		if (p4d_is_leaf(*p4d)) {
> > +			split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
> >   			continue;
> >   		}
> > -		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
> > +		pud_base = (pud_t *)p4d_page_vaddr(*p4d);
> >   		remove_pud_table(pud_base, addr, next);
> >   	}
> > diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
> > index 2ef24a53f4c9..25a0c044bd93 100644
> > --- a/arch/powerpc/mm/book3s64/subpage_prot.c
> > +++ b/arch/powerpc/mm/book3s64/subpage_prot.c
> > @@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
> >   			     int npages)
> >   {
> >   	pgd_t *pgd;
> > +	p4d_t *p4d;
> >   	pud_t *pud;
> >   	pmd_t *pmd;
> >   	pte_t *pte;
> >   	spinlock_t *ptl;
> >   	pgd = pgd_offset(mm, addr);
> > -	if (pgd_none(*pgd))
> > +	p4d = p4d_offset(pgd, addr);
> > +	if (p4d_none(*p4d))
> >   		return;
> > -	pud = pud_offset(pgd, addr);
> > +	pud = pud_offset(p4d, addr);
> >   	if (pud_none(*pud))
> >   		return;
> >   	pmd = pmd_offset(pud, addr);
> > diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> > index 33b3461d91e8..54f5994d4cbb 100644
> > --- a/arch/powerpc/mm/hugetlbpage.c
> > +++ b/arch/powerpc/mm/hugetlbpage.c
> > @@ -119,6 +119,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
> >   pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
> >   {
> >   	pgd_t *pg;
> > +	p4d_t *p4;
> >   	pud_t *pu;
> >   	pmd_t *pm;
> >   	hugepd_t *hpdp = NULL;
> > @@ -128,20 +129,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
> >   	addr &= ~(sz-1);
> >   	pg = pgd_offset(mm, addr);
> > +	p4 = p4d_offset(pg, addr);
> >   #ifdef CONFIG_PPC_BOOK3S_64
> >   	if (pshift == PGDIR_SHIFT)
> >   		/* 16GB huge page */
> > -		return (pte_t *) pg;
> > +		return (pte_t *) p4;
> >   	else if (pshift > PUD_SHIFT) {
> >   		/*
> >   		 * We need to use hugepd table
> >   		 */
> >   		ptl = &mm->page_table_lock;
> > -		hpdp = (hugepd_t *)pg;
> > +		hpdp = (hugepd_t *)p4;
> >   	} else {
> >   		pdshift = PUD_SHIFT;
> > -		pu = pud_alloc(mm, pg, addr);
> > +		pu = pud_alloc(mm, p4, addr);
> >   		if (!pu)
> >   			return NULL;
> >   		if (pshift == PUD_SHIFT)
> > @@ -166,10 +168,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
> >   #else
> >   	if (pshift >= PGDIR_SHIFT) {
> >   		ptl = &mm->page_table_lock;
> > -		hpdp = (hugepd_t *)pg;
> > +		hpdp = (hugepd_t *)p4;
> >   	} else {
> >   		pdshift = PUD_SHIFT;
> > -		pu = pud_alloc(mm, pg, addr);
> > +		pu = pud_alloc(mm, p4, addr);
> >   		if (!pu)
> >   			return NULL;
> >   		if (pshift >= PUD_SHIFT) {
> > @@ -390,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
> >   	mm_dec_nr_pmds(tlb->mm);
> >   }
> > -static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
> > +static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
> >   				   unsigned long addr, unsigned long end,
> >   				   unsigned long floor, unsigned long ceiling)
> >   {
> > @@ -400,7 +402,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
> >   	start = addr;
> >   	do {
> > -		pud = pud_offset(pgd, addr);
> > +		pud = pud_offset(p4d, addr);
> >   		next = pud_addr_end(addr, end);
> >   		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
> >   			if (pud_none_or_clear_bad(pud))
> > @@ -435,8 +437,8 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
> >   	if (end - 1 > ceiling - 1)
> >   		return;
> > -	pud = pud_offset(pgd, start);
> > -	pgd_clear(pgd);
> > +	pud = pud_offset(p4d, start);
> > +	p4d_clear(p4d);
> >   	pud_free_tlb(tlb, pud, start);
> >   	mm_dec_nr_puds(tlb->mm);
> >   }
> > @@ -449,6 +451,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
> >   			    unsigned long floor, unsigned long ceiling)
> >   {
> >   	pgd_t *pgd;
> > +	p4d_t *p4d;
> >   	unsigned long next;
> >   	/*
> > @@ -471,10 +474,11 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
> >   	do {
> >   		next = pgd_addr_end(addr, end);
> >   		pgd = pgd_offset(tlb->mm, addr);
> > +		p4d = p4d_offset(pgd, addr);
> >   		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
> > -			if (pgd_none_or_clear_bad(pgd))
> > +			if (p4d_none_or_clear_bad(p4d))
> >   				continue;
> > -			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
> > +			hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
> >   		} else {
> >   			unsigned long more;
> >   			/*
> > @@ -487,7 +491,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
> >   			if (more > next)
> >   				next = more;
> > -			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
> > +			free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
> >   					  addr, next, floor, ceiling);
> >   		}
> >   	} while (addr = next, addr != end);
> > diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
> > index db5664dde5ff..88e2e16380b5 100644
> > --- a/arch/powerpc/mm/kasan/kasan_init_32.c
> > +++ b/arch/powerpc/mm/kasan/kasan_init_32.c
> > @@ -36,7 +36,7 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned
> >   	unsigned long k_cur, k_next;
> >   	pte_t *new = NULL;
> > -	pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_start), k_start), k_start), k_start);
> >   	for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
> >   		k_next = pgd_addr_end(k_cur, k_end);
> > @@ -78,7 +78,7 @@ static int __init kasan_init_region(void *start, size_t size)
> >   	block = memblock_alloc(k_end - k_start, PAGE_SIZE);
> >   	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
> > -		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
> > +		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
> >   		void *va = block + k_cur - k_start;
> >   		pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
> > @@ -102,7 +102,7 @@ static void __init kasan_remap_early_shadow_ro(void)
> >   	kasan_populate_pte(kasan_early_shadow_pte, prot);
> >   	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
> > -		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
> > +		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
> >   		pte_t *ptep = pte_offset_kernel(pmd, k_cur);
> >   		if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
> > @@ -201,7 +201,7 @@ void __init kasan_early_init(void)
> >   	unsigned long addr = KASAN_SHADOW_START;
> >   	unsigned long end = KASAN_SHADOW_END;
> >   	unsigned long next;
> > -	pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
> > +	pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(addr), addr), addr), addr);
> >   	BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
> > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> > index ef7b1119b2e2..8262b384dcf3 100644
> > --- a/arch/powerpc/mm/mem.c
> > +++ b/arch/powerpc/mm/mem.c
> > @@ -69,8 +69,8 @@ EXPORT_SYMBOL(kmap_prot);
> >   static inline pte_t *virt_to_kpte(unsigned long vaddr)
> >   {
> > -	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
> > -			vaddr), vaddr), vaddr);
> > +	return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr),
> > +			vaddr), vaddr), vaddr), vaddr);
> >   }
> >   #endif
> > diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c
> > index f348104eb461..7aaf7155e350 100644
> > --- a/arch/powerpc/mm/nohash/40x.c
> > +++ b/arch/powerpc/mm/nohash/40x.c
> > @@ -104,7 +104,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
> >   		pmd_t *pmdp;
> >   		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
> > -		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
> > +		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
> >   		*pmdp++ = __pmd(val);
> >   		*pmdp++ = __pmd(val);
> >   		*pmdp++ = __pmd(val);
> > @@ -119,7 +119,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
> >   		pmd_t *pmdp;
> >   		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
> > -		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
> > +		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
> >   		*pmdp = __pmd(val);
> >   		v += LARGE_PAGE_SIZE_4M;
> > diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
> > index 4637fdd469cf..77884e24281d 100644
> > --- a/arch/powerpc/mm/nohash/book3e_pgtable.c
> > +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
> > @@ -73,6 +73,7 @@ static void __init *early_alloc_pgtable(unsigned long size)
> >   int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
> >   {
> >   	pgd_t *pgdp;
> > +	p4d_t *p4dp;
> >   	pud_t *pudp;
> >   	pmd_t *pmdp;
> >   	pte_t *ptep;
> > @@ -80,7 +81,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
> >   	BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
> >   	if (slab_is_available()) {
> >   		pgdp = pgd_offset_k(ea);
> > -		pudp = pud_alloc(&init_mm, pgdp, ea);
> > +		p4dp = p4d_offset(pgdp, ea);
> > +		pudp = pud_alloc(&init_mm, p4dp, ea);
> >   		if (!pudp)
> >   			return -ENOMEM;
> >   		pmdp = pmd_alloc(&init_mm, pudp, ea);
> > @@ -91,13 +93,12 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
> >   			return -ENOMEM;
> >   	} else {
> >   		pgdp = pgd_offset_k(ea);
> > -#ifndef __PAGETABLE_PUD_FOLDED
> > -		if (pgd_none(*pgdp)) {
> > -			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
> > -			pgd_populate(&init_mm, pgdp, pudp);
> > +		p4dp = p4d_offset(pgdp, ea);
> > +		if (p4d_none(*p4dp)) {
> > +			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
> > +			p4d_populate(&init_mm, p4dp, pmdp);
> >   		}
> > -#endif /* !__PAGETABLE_PUD_FOLDED */
> > -		pudp = pud_offset(pgdp, ea);
> > +		pudp = pud_offset(p4dp, ea);
> >   		if (pud_none(*pudp)) {
> >   			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
> >   			pud_populate(&init_mm, pudp, pmdp);
> > diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> > index e3759b69f81b..c2499271f6c1 100644
> > --- a/arch/powerpc/mm/pgtable.c
> > +++ b/arch/powerpc/mm/pgtable.c
> > @@ -265,6 +265,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> >   void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
> >   {
> >   	pgd_t *pgd;
> > +	p4d_t *p4d;
> >   	pud_t *pud;
> >   	pmd_t *pmd;
> > @@ -272,7 +273,9 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
> >   		return;
> >   	pgd = mm->pgd + pgd_index(addr);
> >   	BUG_ON(pgd_none(*pgd));
> > -	pud = pud_offset(pgd, addr);
> > +	p4d = p4d_offset(pgd, addr);
> > +	BUG_ON(p4d_none(*p4d));
> > +	pud = pud_offset(p4d, addr);
> >   	BUG_ON(pud_none(*pud));
> >   	pmd = pmd_offset(pud, addr);
> >   	/*
> > @@ -312,12 +315,13 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys);
> >   pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
> >   			bool *is_thp, unsigned *hpage_shift)
> >   {
> > -	pgd_t pgd, *pgdp;
> > +	pgd_t *pgdp;
> > +	p4d_t p4d, *p4dp;
> >   	pud_t pud, *pudp;
> >   	pmd_t pmd, *pmdp;
> >   	pte_t *ret_pte;
> >   	hugepd_t *hpdp = NULL;
> > -	unsigned pdshift = PGDIR_SHIFT;
> > +	unsigned pdshift;
> >   	if (hpage_shift)
> >   		*hpage_shift = 0;
> > @@ -325,24 +329,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
> >   	if (is_thp)
> >   		*is_thp = false;
> > -	pgdp = pgdir + pgd_index(ea);
> > -	pgd  = READ_ONCE(*pgdp);
> >   	/*
> >   	 * Always operate on the local stack value. This make sure the
> >   	 * value don't get updated by a parallel THP split/collapse,
> >   	 * page fault or a page unmap. The return pte_t * is still not
> >   	 * stable. So should be checked there for above conditions.
> > +	 * Top level is an exception because it is folded into p4d.
> >   	 */
> > -	if (pgd_none(pgd))
> > +	pgdp = pgdir + pgd_index(ea);
> > +	p4dp = p4d_offset(pgdp, ea);
> > +	p4d  = READ_ONCE(*p4dp);
> > +	pdshift = P4D_SHIFT;
> > +
> > +	if (p4d_none(p4d))
> >   		return NULL;
> > -	if (pgd_is_leaf(pgd)) {
> > -		ret_pte = (pte_t *)pgdp;
> > +	if (p4d_is_leaf(p4d)) {
> > +		ret_pte = (pte_t *)p4dp;
> >   		goto out;
> >   	}
> > -	if (is_hugepd(__hugepd(pgd_val(pgd)))) {
> > -		hpdp = (hugepd_t *)&pgd;
> > +	if (is_hugepd(__hugepd(p4d_val(p4d)))) {
> > +		hpdp = (hugepd_t *)&p4d;
> >   		goto out_huge;
> >   	}
> > @@ -352,7 +360,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
> >   	 * irq disabled
> >   	 */
> >   	pdshift = PUD_SHIFT;
> > -	pudp = pud_offset(&pgd, ea);
> > +	pudp = pud_offset(&p4d, ea);
> >   	pud  = READ_ONCE(*pudp);
> >   	if (pud_none(pud))
> > diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
> > index 5fb90edd865e..5774d4bc94d0 100644
> > --- a/arch/powerpc/mm/pgtable_32.c
> > +++ b/arch/powerpc/mm/pgtable_32.c
> > @@ -63,7 +63,7 @@ int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
> >   	int err = -ENOMEM;
> >   	/* Use upper 10 bits of VA to index the first level map */
> > -	pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
> > +	pd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
> >   	/* Use middle 10 bits of VA to index the second-level map */
> >   	if (likely(slab_is_available()))
> >   		pg = pte_alloc_kernel(pd, va);
> > @@ -121,53 +121,24 @@ void __init mapin_ram(void)
> >   	}
> >   }
> > -/* Scan the real Linux page tables and return a PTE pointer for
> > - * a virtual address in a context.
> > - * Returns true (1) if PTE was found, zero otherwise.  The pointer to
> > - * the PTE pointer is unmodified if PTE is not found.
> > - */
> > -static int
> > -get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
> > -{
> > -        pgd_t	*pgd;
> > -	pud_t	*pud;
> > -        pmd_t	*pmd;
> > -        pte_t	*pte;
> > -        int     retval = 0;
> > -
> > -        pgd = pgd_offset(mm, addr & PAGE_MASK);
> > -        if (pgd) {
> > -		pud = pud_offset(pgd, addr & PAGE_MASK);
> > -		if (pud && pud_present(*pud)) {
> > -			pmd = pmd_offset(pud, addr & PAGE_MASK);
> > -			if (pmd_present(*pmd)) {
> > -				pte = pte_offset_map(pmd, addr & PAGE_MASK);
> > -				if (pte) {
> > -					retval = 1;
> > -					*ptep = pte;
> > -					if (pmdp)
> > -						*pmdp = pmd;
> > -					/* XXX caller needs to do pte_unmap, yuck */
> > -				}
> > -			}
> > -		}
> > -        }
> > -        return(retval);
> > -}
> > -
> >   static int __change_page_attr_noflush(struct page *page, pgprot_t prot)
> >   {
> >   	pte_t *kpte;
> >   	pmd_t *kpmd;
> > -	unsigned long address;
> > +	unsigned long address, va;
> >   	BUG_ON(PageHighMem(page));
> >   	address = (unsigned long)page_address(page);
> > +	va = address & PAGE_MASK;
> >   	if (v_block_mapped(address))
> >   		return 0;
> > -	if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
> > +
> > +	kpmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
> > +	if (!pmd_present(*kpmd))
> >   		return -EINVAL;
> > +
> > +	kpte = pte_offset_map(kpmd, va);
> >   	__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
> >   	pte_unmap(kpte);
> > diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> > index e78832dce7bb..1f86a88fd4bb 100644
> > --- a/arch/powerpc/mm/pgtable_64.c
> > +++ b/arch/powerpc/mm/pgtable_64.c
> > @@ -101,13 +101,13 @@ EXPORT_SYMBOL(__pte_frag_size_shift);
> >   #ifndef __PAGETABLE_PUD_FOLDED
> >   /* 4 level page table */
> > -struct page *pgd_page(pgd_t pgd)
> > +struct page *p4d_page(p4d_t p4d)
> >   {
> > -	if (pgd_is_leaf(pgd)) {
> > -		VM_WARN_ON(!pgd_huge(pgd));
> > -		return pte_page(pgd_pte(pgd));
> > +	if (p4d_is_leaf(p4d)) {
> > +		VM_WARN_ON(!p4d_huge(p4d));
> > +		return pte_page(p4d_pte(p4d));
> >   	}
> > -	return virt_to_page(pgd_page_vaddr(pgd));
> > +	return virt_to_page(p4d_page_vaddr(p4d));
> >   }
> >   #endif
> > diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
> > index a07278027c6f..ac360ad865a8 100644
> > --- a/arch/powerpc/mm/ptdump/hashpagetable.c
> > +++ b/arch/powerpc/mm/ptdump/hashpagetable.c
> > @@ -417,9 +417,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
> >   	}
> >   }
> > -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> > +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
> >   {
> > -	pud_t *pud = pud_offset(pgd, 0);
> > +	pud_t *pud = pud_offset(p4d, 0);
> >   	unsigned long addr;
> >   	unsigned int i;
> > @@ -431,6 +431,20 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> >   	}
> >   }
> > +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
> > +{
> > +	p4d_t *p4d = p4d_offset(pgd, 0);
> > +	unsigned long addr;
> > +	unsigned int i;
> > +
> > +	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
> > +		addr = start + i * P4D_SIZE;
> > +		if (!p4d_none(*p4d))
> > +			/* p4d exists */
> > +			walk_pud(st, p4d, addr);
> > +	}
> > +}
> > +
> >   static void walk_pagetables(struct pg_state *st)
> >   {
> >   	pgd_t *pgd = pgd_offset_k(0UL);
> > @@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st)
> >   		addr = KERN_VIRT_START + i * PGDIR_SIZE;
> >   		if (!pgd_none(*pgd))
> >   			/* pgd exists */
> > -			walk_pud(st, pgd, addr);
> > +			walk_p4d(st, pgd, addr);
> >   	}
> >   }
> > diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
> > index 206156255247..9d6256b61df3 100644
> > --- a/arch/powerpc/mm/ptdump/ptdump.c
> > +++ b/arch/powerpc/mm/ptdump/ptdump.c
> > @@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
> >   	}
> >   }
> > -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> > +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
> >   {
> > -	pud_t *pud = pud_offset(pgd, 0);
> > +	pud_t *pud = pud_offset(p4d, 0);
> >   	unsigned long addr;
> >   	unsigned int i;
> > @@ -304,11 +304,13 @@ static void walk_pagetables(struct pg_state *st)
> >   	 * the hash pagetable.
> >   	 */
> >   	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
> > -		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
> > -			/* pgd exists */
> > -			walk_pud(st, pgd, addr);
> > +		p4d_t *p4d = p4d_offset(pgd, 0);
> > +
> > +		if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d))
> > +			/* p4d exists */
> > +			walk_pud(st, p4d, addr);
> >   		else
> > -			note_page(st, addr, 1, pgd_val(*pgd));
> > +			note_page(st, addr, 1, p4d_val(*p4d));
> >   	}
> >   }
> > diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> > index 0ec9640335bb..3e29128c58cc 100644
> > --- a/arch/powerpc/xmon/xmon.c
> > +++ b/arch/powerpc/xmon/xmon.c
> > @@ -3130,6 +3130,7 @@ static void show_pte(unsigned long addr)
> >   	struct task_struct *tsk = NULL;
> >   	struct mm_struct *mm;
> >   	pgd_t *pgdp, *pgdir;
> > +	p4d_t *p4dp;
> >   	pud_t *pudp;
> >   	pmd_t *pmdp;
> >   	pte_t *ptep;
> > @@ -3161,20 +3162,21 @@ static void show_pte(unsigned long addr)
> >   		pgdir = pgd_offset(mm, 0);
> >   	}
> > -	if (pgd_none(*pgdp)) {
> > -		printf("no linux page table for address\n");
> > +	p4dp = p4d_offset(pgdp, addr);
> > +
> > +	if (p4d_none(*p4dp)) {
> > +		printf("No valid P4D\n");
> >   		return;
> >   	}
> > -	printf("pgd  @ 0x%px\n", pgdir);
> > -
> > -	if (pgd_is_leaf(*pgdp)) {
> > -		format_pte(pgdp, pgd_val(*pgdp));
> > +	if (p4d_is_leaf(*p4dp)) {
> > +		format_pte(p4dp, p4d_val(*p4dp));
> >   		return;
> >   	}
> > -	printf("pgdp @ 0x%px = 0x%016lx\n", pgdp, pgd_val(*pgdp));
> > -	pudp = pud_offset(pgdp, addr);
> > +	printf("p4dp @ 0x%px = 0x%016lx\n", p4dp, p4d_val(*p4dp));
> > +
> > +	pudp = pud_offset(p4dp, addr);
> >   	if (pud_none(*pudp)) {
> >   		printf("No valid PUD\n");
> > 
> 
>
Christophe Leroy Feb. 26, 2020, 11:20 a.m. UTC | #8
Le 26/02/2020 à 11:56, Mike Rapoport a écrit :
> On Wed, Feb 26, 2020 at 10:46:13AM +0100, Christophe Leroy wrote:
>>
>>
>> Le 26/02/2020 à 10:13, Mike Rapoport a écrit :
>>> On Tue, Feb 18, 2020 at 12:54:40PM +0200, Mike Rapoport wrote:
>>>> On Sun, Feb 16, 2020 at 11:41:07AM +0100, Christophe Leroy wrote:
>>>>>
>>>>>
>>>>> Le 16/02/2020 à 09:18, Mike Rapoport a écrit :
>>>>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>>>>
>>>>>> Implement primitives necessary for the 4th level folding, add walks of p4d
>>>>>> level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.
>>>>>
>>>>> I don't think it is worth adding all this additionnals walks of p4d, this
>>>>> patch could be limited to changes like:
>>>>>
>>>>> -		pud = pud_offset(pgd, gpa);
>>>>> +		pud = pud_offset(p4d_offset(pgd, gpa), gpa);
>>>>>
>>>>> The additionnal walks should be added through another patch the day powerpc
>>>>> need them.
>>>>
>>>> Ok, I'll update the patch to reduce walking the p4d.
>>>
>>> Here's what I have with more direct acceses from pgd to pud.
>>
>> I went quickly through. This looks promising.
>>
>> Do we need the walk_p4d() in arch/powerpc/mm/ptdump/hashpagetable.c?
>> Can't we just do
>>
>> @@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st)
>>   		addr = KERN_VIRT_START + i * PGDIR_SIZE;
>>   		if (!pgd_none(*pgd))
>>   			/* pgd exists */
>> -			walk_pud(st, pgd, addr);
>> +			walk_pud(st, p4d_offset(pgd, addr), addr);
> 
> We can do
> 
> 	addr = KERN_VIRT_START + i * PGDIR_SIZE;
> 	p4d = p4d_offset(pgd, addr);
> 	if (!p4d_none(*p4d))
> 		walk_pud()
> 
> But I don't think this is really essential. Again, we are trading off code
> consistency vs line count. I don't think line count is that important.

Ok.

Christophe
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 5b39c11e884a..39ec11371be0 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -2,7 +2,6 @@ 
 #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
 #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopmd.h>
 
 #include <asm/book3s/32/hash.h>
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 2781ebf6add4..876d1528c2cf 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -134,9 +134,9 @@  static inline int get_region_id(unsigned long ea)
 
 #define	hash__pmd_bad(pmd)		(pmd_val(pmd) & H_PMD_BAD_BITS)
 #define	hash__pud_bad(pud)		(pud_val(pud) & H_PUD_BAD_BITS)
-static inline int hash__pgd_bad(pgd_t pgd)
+static inline int hash__p4d_bad(p4d_t p4d)
 {
-	return (pgd_val(pgd) == 0);
+	return (p4d_val(p4d) == 0);
 }
 #ifdef CONFIG_STRICT_KERNEL_RWX
 extern void hash__mark_rodata_ro(void);
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index a41e91bd0580..69c5b051734f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -85,9 +85,9 @@  static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
 {
-	*pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
+	*pgd =  __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 201a69e6a355..ddddbafff0ab 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -2,7 +2,7 @@ 
 #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
 #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
 
-#include <asm-generic/5level-fixup.h>
+#include <asm-generic/pgtable-nop4d.h>
 
 #ifndef __ASSEMBLY__
 #include <linux/mmdebug.h>
@@ -251,7 +251,7 @@  extern unsigned long __pmd_frag_size_shift;
 /* Bits to mask out from a PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0xc0000000000000ffUL
 /* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS		0xc0000000000000ffUL
+#define P4D_MASKED_BITS		0xc0000000000000ffUL
 
 /*
  * Used as an indicator for rcu callback functions
@@ -949,54 +949,60 @@  static inline bool pud_access_permitted(pud_t pud, bool write)
 	return pte_access_permitted(pud_pte(pud), write);
 }
 
-#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
+#define __p4d_raw(x)	((p4d_t) { __pgd_raw(x) })
+static inline __be64 p4d_raw(p4d_t x)
+{
+	return pgd_raw(x.pgd);
+}
+
+#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
 
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void p4d_clear(p4d_t *p4dp)
 {
-	*pgdp = __pgd(0);
+	*p4dp = __p4d(0);
 }
 
-static inline int pgd_none(pgd_t pgd)
+static inline int p4d_none(p4d_t p4d)
 {
-	return !pgd_raw(pgd);
+	return !p4d_raw(p4d);
 }
 
-static inline int pgd_present(pgd_t pgd)
+static inline int p4d_present(p4d_t p4d)
 {
-	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
+	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT));
 }
 
-static inline pte_t pgd_pte(pgd_t pgd)
+static inline pte_t p4d_pte(p4d_t p4d)
 {
-	return __pte_raw(pgd_raw(pgd));
+	return __pte_raw(p4d_raw(p4d));
 }
 
-static inline pgd_t pte_pgd(pte_t pte)
+static inline p4d_t pte_p4d(pte_t pte)
 {
-	return __pgd_raw(pte_raw(pte));
+	return __p4d_raw(pte_raw(pte));
 }
 
-static inline int pgd_bad(pgd_t pgd)
+static inline int p4d_bad(p4d_t p4d)
 {
 	if (radix_enabled())
-		return radix__pgd_bad(pgd);
-	return hash__pgd_bad(pgd);
+		return radix__p4d_bad(p4d);
+	return hash__p4d_bad(p4d);
 }
 
-#define pgd_access_permitted pgd_access_permitted
-static inline bool pgd_access_permitted(pgd_t pgd, bool write)
+#define p4d_access_permitted p4d_access_permitted
+static inline bool p4d_access_permitted(p4d_t p4d, bool write)
 {
-	return pte_access_permitted(pgd_pte(pgd), write);
+	return pte_access_permitted(p4d_pte(p4d), write);
 }
 
-extern struct page *pgd_page(pgd_t pgd);
+extern struct page *p4d_page(p4d_t p4d);
 
 /* Pointers in the page table tree are physical addresses */
 #define __pgtable_ptr_val(ptr)	__pa(ptr)
 
 #define pmd_page_vaddr(pmd)	__va(pmd_val(pmd) & ~PMD_MASKED_BITS)
 #define pud_page_vaddr(pud)	__va(pud_val(pud) & ~PUD_MASKED_BITS)
-#define pgd_page_vaddr(pgd)	__va(pgd_val(pgd) & ~PGD_MASKED_BITS)
+#define p4d_page_vaddr(p4d)	__va(p4d_val(p4d) & ~P4D_MASKED_BITS)
 
 #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
 #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
@@ -1010,8 +1016,8 @@  extern struct page *pgd_page(pgd_t pgd);
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
-#define pud_offset(pgdp, addr)	\
-	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
+#define pud_offset(p4dp, addr)	\
+	(((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr))
 #define pmd_offset(pudp,addr) \
 	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
 #define pte_offset_kernel(dir,addr) \
@@ -1368,6 +1374,12 @@  static inline bool pud_is_leaf(pud_t pud)
 	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
 }
 
+#define p4d_is_leaf p4d_is_leaf
+static inline bool p4d_is_leaf(p4d_t p4d)
+{
+	return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE));
+}
+
 #define pgd_is_leaf pgd_is_leaf
 #define pgd_leaf pgd_is_leaf
 static inline bool pgd_is_leaf(pgd_t pgd)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..9bca2ac64220 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -30,7 +30,7 @@ 
 /* Don't have anything in the reserved bits and leaf bits */
 #define RADIX_PMD_BAD_BITS		0x60000000000000e0UL
 #define RADIX_PUD_BAD_BITS		0x60000000000000e0UL
-#define RADIX_PGD_BAD_BITS		0x60000000000000e0UL
+#define RADIX_P4D_BAD_BITS		0x60000000000000e0UL
 
 #define RADIX_PMD_SHIFT		(PAGE_SHIFT + RADIX_PTE_INDEX_SIZE)
 #define RADIX_PUD_SHIFT		(RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE)
@@ -227,9 +227,9 @@  static inline int radix__pud_bad(pud_t pud)
 }
 
 
-static inline int radix__pgd_bad(pgd_t pgd)
+static inline int radix__p4d_bad(p4d_t p4d)
 {
-	return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS);
+	return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 60c4d829152e..d4c2c4259fa3 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -2,7 +2,6 @@ 
 #ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H
 #define _ASM_POWERPC_NOHASH_32_PGTABLE_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopmd.h>
 
 #ifndef __ASSEMBLY__
diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
index b9534a793293..668aee6017e7 100644
--- a/arch/powerpc/include/asm/nohash/64/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
@@ -15,7 +15,7 @@  struct vmemmap_backing {
 };
 extern struct vmemmap_backing *vmemmap_list;
 
-#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)PUD)
+#define p4d_populate(MM, P4D, PUD)	p4d_set(P4D, (unsigned long)PUD)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
index c40ec32b8194..81b1c54e3cf1 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
@@ -2,7 +2,7 @@ 
 #ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
 #define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H
 
-#include <asm-generic/5level-fixup.h>
+#include <asm-generic/pgtable-nop4d.h>
 
 /*
  * Entries per page directory level.  The PTE level must use a 64b record
@@ -45,41 +45,41 @@ 
 #define PMD_MASKED_BITS		0
 /* Bits to mask out from a PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0
-/* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS		0
+/* Bits to mask out from a P4D to get to the PUD page */
+#define P4D_MASKED_BITS		0
 
 
 /*
  * 4-level page tables related bits
  */
 
-#define pgd_none(pgd)		(!pgd_val(pgd))
-#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
-#define pgd_present(pgd)	(pgd_val(pgd) != 0)
-#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
+#define p4d_none(p4d)		(!p4d_val(p4d))
+#define p4d_bad(p4d)		(p4d_val(p4d) == 0)
+#define p4d_present(p4d)	(p4d_val(p4d) != 0)
+#define p4d_page_vaddr(p4d)	(p4d_val(p4d) & ~P4D_MASKED_BITS)
 
 #ifndef __ASSEMBLY__
 
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void p4d_clear(p4d_t *p4dp)
 {
-	*pgdp = __pgd(0);
+	*p4dp = __p4d(0);
 }
 
-static inline pte_t pgd_pte(pgd_t pgd)
+static inline pte_t p4d_pte(p4d_t p4d)
 {
-	return __pte(pgd_val(pgd));
+	return __pte(p4d_val(p4d));
 }
 
-static inline pgd_t pte_pgd(pte_t pte)
+static inline p4d_t pte_p4d(pte_t pte)
 {
-	return __pgd(pte_val(pte));
+	return __p4d(pte_val(pte));
 }
-extern struct page *pgd_page(pgd_t pgd);
+extern struct page *p4d_page(p4d_t p4d);
 
 #endif /* !__ASSEMBLY__ */
 
-#define pud_offset(pgdp, addr)	\
-  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
+#define pud_offset(p4dp, addr)	\
+  (((pud_t *) p4d_page_vaddr(*(p4dp))) + \
     (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
 
 #define pud_ERROR(e) \
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 9a33b8bd842d..b360f262b9c6 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -175,11 +175,11 @@  static inline pud_t pte_pud(pte_t pte)
 	return __pud(pte_val(pte));
 }
 #define pud_write(pud)		pte_write(pud_pte(pud))
-#define pgd_write(pgd)		pte_write(pgd_pte(pgd))
+#define p4d_write(p4d)		pte_write(p4d_pte(p4d))
 
-static inline void pgd_set(pgd_t *pgdp, unsigned long val)
+static inline void p4d_set(p4d_t *p4dp, unsigned long val)
 {
-	*pgdp = __pgd(val);
+	*p4dp = __p4d(val);
 }
 
 /*
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 8cc543ed114c..0a05fddd7881 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -139,6 +139,14 @@  static inline bool pud_is_leaf(pud_t pud)
 }
 #endif
 
+#ifndef p4d_is_leaf
+#define p4d_is_leaf p4d_is_leaf
+static inline bool p4d_is_leaf(p4d_t p4d)
+{
+	return false;
+}
+#endif
+
 #ifndef pgd_is_leaf
 #define pgd_is_leaf pgd_is_leaf
 static inline bool pgd_is_leaf(pgd_t pgd)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 803940d79b73..5aacfa0b27ef 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -494,17 +494,39 @@  static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
 	pud_free(kvm->mm, pud);
 }
 
+static void kvmppc_unmap_free_p4d(struct kvm *kvm, p4d_t *p4d,
+				  unsigned int lpid)
+{
+	unsigned long iu;
+	p4d_t *p = p4d;
+
+	for (iu = 0; iu < PTRS_PER_P4D; ++iu, ++p) {
+		if (!p4d_present(*p))
+			continue;
+		if (p4d_is_leaf(*p)) {
+			p4d_clear(p);
+		} else {
+			pud_t *pud;
+
+			pud = pud_offset(p, 0);
+			kvmppc_unmap_free_pud(kvm, pud, lpid);
+			p4d_clear(p);
+		}
+	}
+	p4d_free(kvm->mm, p4d);
+}
+
 void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 {
 	unsigned long ig;
 
 	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
-		pud_t *pud;
+		p4d_t *p4d;
 
 		if (!pgd_present(*pgd))
 			continue;
-		pud = pud_offset(pgd, 0);
-		kvmppc_unmap_free_pud(kvm, pud, lpid);
+		p4d = p4d_offset(pgd, 0);
+		kvmppc_unmap_free_p4d(kvm, p4d, lpid);
 		pgd_clear(pgd);
 	}
 }
@@ -566,6 +588,7 @@  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 		      unsigned long *rmapp, struct rmap_nested **n_rmap)
 {
 	pgd_t *pgd;
+	p4d_t *p4d, *new_p4d = NULL;
 	pud_t *pud, *new_pud = NULL;
 	pmd_t *pmd, *new_pmd = NULL;
 	pte_t *ptep, *new_ptep = NULL;
@@ -573,9 +596,15 @@  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 
 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
 	pgd = pgtable + pgd_index(gpa);
-	pud = NULL;
+	p4d = NULL;
 	if (pgd_present(*pgd))
-		pud = pud_offset(pgd, gpa);
+		p4d = p4d_offset(pgd, gpa);
+	else
+		new_p4d = p4d_alloc_one(kvm->mm, gpa);
+
+	pud = NULL;
+	if (p4d_present(*p4d))
+		pud = pud_offset(p4d, gpa);
 	else
 		new_pud = pud_alloc_one(kvm->mm, gpa);
 
@@ -597,12 +626,18 @@  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 	/* Now traverse again under the lock and change the tree */
 	ret = -ENOMEM;
 	if (pgd_none(*pgd)) {
+		if (!new_p4d)
+			goto out_unlock;
+		pgd_populate(kvm->mm, pgd, new_p4d);
+		new_p4d = NULL;
+	}
+	if (p4d_none(*p4d)) {
 		if (!new_pud)
 			goto out_unlock;
-		pgd_populate(kvm->mm, pgd, new_pud);
+		p4d_populate(kvm->mm, p4d, new_pud);
 		new_pud = NULL;
 	}
-	pud = pud_offset(pgd, gpa);
+	pud = pud_offset(p4d, gpa);
 	if (pud_is_leaf(*pud)) {
 		unsigned long hgpa = gpa & PUD_MASK;
 
@@ -1220,6 +1255,7 @@  static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
 	pgd_t *pgt;
 	struct kvm_nested_guest *nested;
 	pgd_t pgd, *pgdp;
+	p4d_t p4d, *p4dp;
 	pud_t pud, *pudp;
 	pmd_t pmd, *pmdp;
 	pte_t *ptep;
@@ -1298,7 +1334,14 @@  static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
 			continue;
 		}
 
-		pudp = pud_offset(&pgd, gpa);
+		p4dp = p4d_offset(&pgd, gpa);
+		p4d = READ_ONCE(*p4dp);
+		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
+			gpa = (gpa & P4D_MASK) + P4D_SIZE;
+			continue;
+		}
+
+		pudp = pud_offset(&p4d, gpa);
 		pud = READ_ONCE(*pudp);
 		if (!(pud_val(pud) & _PAGE_PRESENT)) {
 			gpa = (gpa & PUD_MASK) + PUD_SIZE;
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 3345f039a876..7a59f6863cec 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -107,13 +107,18 @@  static inline int unmap_patch_area(unsigned long addr)
 	pte_t *ptep;
 	pmd_t *pmdp;
 	pud_t *pudp;
+	p4d_t *p4dp;
 	pgd_t *pgdp;
 
 	pgdp = pgd_offset_k(addr);
 	if (unlikely(!pgdp))
 		return -EINVAL;
 
-	pudp = pud_offset(pgdp, addr);
+	p4dp = p4d_offset(pgdp, addr);
+	if (unlikely(!p4dp))
+		return -EINVAL;
+
+	pudp = pud_offset(p4dp, addr);
 	if (unlikely(!pudp))
 		return -EINVAL;
 
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 0a1c65a2c565..b2fc3e71165c 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -312,7 +312,7 @@  void hash_preload(struct mm_struct *mm, unsigned long ea)
 
 	if (!Hash)
 		return;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, ea), ea), ea), ea);
 	if (!pmd_none(*pmd))
 		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
 }
diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
index 2fcd321040ff..175bc33b41b7 100644
--- a/arch/powerpc/mm/book3s32/tlb.c
+++ b/arch/powerpc/mm/book3s32/tlb.c
@@ -87,7 +87,7 @@  static void flush_range(struct mm_struct *mm, unsigned long start,
 	if (start >= end)
 		return;
 	end = (end - 1) | ~PAGE_MASK;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, start), start), start), start);
 	for (;;) {
 		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
 		if (pmd_end > end)
@@ -145,7 +145,7 @@  void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 		return;
 	}
 	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr), vmaddr);
 	if (!pmd_none(*pmd))
 		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
 }
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 64733b9cb20a..9cd15937e88a 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -148,6 +148,7 @@  void hash__vmemmap_remove_mapping(unsigned long start,
 int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 {
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -155,7 +156,8 @@  int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
 	if (slab_is_available()) {
 		pgdp = pgd_offset_k(ea);
-		pudp = pud_alloc(&init_mm, pgdp, ea);
+		p4dp = p4d_offset(pgdp, ea);
+		pudp = pud_alloc(&init_mm, p4dp, ea);
 		if (!pudp)
 			return -ENOMEM;
 		pmdp = pmd_alloc(&init_mm, pudp, ea);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index dd1bea45325c..11762556fe4d 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -64,17 +64,24 @@  static int early_map_kernel_page(unsigned long ea, unsigned long pa,
 {
 	unsigned long pfn = pa >> PAGE_SHIFT;
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
 
 	pgdp = pgd_offset_k(ea);
 	if (pgd_none(*pgdp)) {
+		p4dp = early_alloc_pgtable(PGD_TABLE_SIZE, nid,
+						region_start, region_end);
+		pgd_populate(&init_mm, pgdp, p4dp);
+	}
+	p4dp = p4d_offset(pgdp, ea);
+	if (p4d_none(*p4dp)) {
 		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
 						region_start, region_end);
-		pgd_populate(&init_mm, pgdp, pudp);
+		p4d_populate(&init_mm, p4dp, pudp);
 	}
-	pudp = pud_offset(pgdp, ea);
+	pudp = pud_offset(p4dp, ea);
 	if (map_page_size == PUD_SIZE) {
 		ptep = (pte_t *)pudp;
 		goto set_the_pte;
@@ -114,6 +121,7 @@  static int __map_kernel_page(unsigned long ea, unsigned long pa,
 {
 	unsigned long pfn = pa >> PAGE_SHIFT;
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -136,7 +144,8 @@  static int __map_kernel_page(unsigned long ea, unsigned long pa,
 	 * boot.
 	 */
 	pgdp = pgd_offset_k(ea);
-	pudp = pud_alloc(&init_mm, pgdp, ea);
+	p4dp = p4d_offset(pgdp, ea);
+	pudp = pud_alloc(&init_mm, p4dp, ea);
 	if (!pudp)
 		return -ENOMEM;
 	if (map_page_size == PUD_SIZE) {
@@ -173,6 +182,7 @@  void radix__change_memory_range(unsigned long start, unsigned long end,
 {
 	unsigned long idx;
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -185,7 +195,8 @@  void radix__change_memory_range(unsigned long start, unsigned long end,
 
 	for (idx = start; idx < end; idx += PAGE_SIZE) {
 		pgdp = pgd_offset_k(idx);
-		pudp = pud_alloc(&init_mm, pgdp, idx);
+		p4dp = p4d_offset(pgdp, idx);
+		pudp = pud_alloc(&init_mm, p4dp, idx);
 		if (!pudp)
 			continue;
 		if (pud_is_leaf(*pudp)) {
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 2ef24a53f4c9..27daeed1a141 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -54,6 +54,7 @@  static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 			     int npages)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -62,7 +63,10 @@  static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 	pgd = pgd_offset(mm, addr);
 	if (pgd_none(*pgd))
 		return;
-	pud = pud_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return;
+	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud))
 		return;
 	pmd = pmd_offset(pud, addr);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 73d4873fc7f8..43d463f20fc3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -112,6 +112,7 @@  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
 	pgd_t *pg;
+	p4d_t *p4;
 	pud_t *pu;
 	pmd_t *pm;
 	hugepd_t *hpdp = NULL;
@@ -121,20 +122,21 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 
 	addr &= ~(sz-1);
 	pg = pgd_offset(mm, addr);
+	p4 = p4d_offset(pg, addr);
 
 #ifdef CONFIG_PPC_BOOK3S_64
 	if (pshift == PGDIR_SHIFT)
 		/* 16GB huge page */
-		return (pte_t *) pg;
+		return (pte_t *) p4;
 	else if (pshift > PUD_SHIFT) {
 		/*
 		 * We need to use hugepd table
 		 */
 		ptl = &mm->page_table_lock;
-		hpdp = (hugepd_t *)pg;
+		hpdp = (hugepd_t *)p4;
 	} else {
 		pdshift = PUD_SHIFT;
-		pu = pud_alloc(mm, pg, addr);
+		pu = pud_alloc(mm, p4, addr);
 		if (!pu)
 			return NULL;
 		if (pshift == PUD_SHIFT)
@@ -159,10 +161,10 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 #else
 	if (pshift >= PGDIR_SHIFT) {
 		ptl = &mm->page_table_lock;
-		hpdp = (hugepd_t *)pg;
+		hpdp = (hugepd_t *)p4;
 	} else {
 		pdshift = PUD_SHIFT;
-		pu = pud_alloc(mm, pg, addr);
+		pu = pud_alloc(mm, p4, addr);
 		if (!pu)
 			return NULL;
 		if (pshift >= PUD_SHIFT) {
@@ -384,7 +386,7 @@  static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	mm_dec_nr_pmds(tlb->mm);
 }
 
-static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 				   unsigned long addr, unsigned long end,
 				   unsigned long floor, unsigned long ceiling)
 {
@@ -394,7 +396,7 @@  static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 	start = addr;
 	do {
-		pud = pud_offset(pgd, addr);
+		pud = pud_offset(p4d, addr);
 		next = pud_addr_end(addr, end);
 		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
 			if (pud_none_or_clear_bad(pud))
@@ -429,8 +431,8 @@  static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	if (end - 1 > ceiling - 1)
 		return;
 
-	pud = pud_offset(pgd, start);
-	pgd_clear(pgd);
+	pud = pud_offset(p4d, start);
+	p4d_clear(p4d);
 	pud_free_tlb(tlb, pud, start);
 	mm_dec_nr_puds(tlb->mm);
 }
@@ -443,6 +445,7 @@  void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			    unsigned long floor, unsigned long ceiling)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	unsigned long next;
 
 	/*
@@ -465,10 +468,11 @@  void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	do {
 		next = pgd_addr_end(addr, end);
 		pgd = pgd_offset(tlb->mm, addr);
+		p4d = p4d_offset(pgd, addr);
 		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
-			if (pgd_none_or_clear_bad(pgd))
+			if (p4d_none_or_clear_bad(p4d))
 				continue;
-			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+			hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
 		} else {
 			unsigned long more;
 			/*
@@ -481,7 +485,7 @@  void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			if (more > next)
 				next = more;
 
-			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+			free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
 					  addr, next, floor, ceiling);
 		}
 	} while (addr = next, addr != end);
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 16dd95bd0749..eed3f1ae3b90 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -36,7 +36,7 @@  static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned
 	unsigned long k_cur, k_next;
 	pte_t *new = NULL;
 
-	pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
+	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_start), k_start), k_start), k_start);
 
 	for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
 		k_next = pgd_addr_end(k_cur, k_end);
@@ -78,7 +78,7 @@  static int __init kasan_init_region(void *start, size_t size)
 	block = memblock_alloc(k_end - k_start, PAGE_SIZE);
 
 	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
-		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
 		void *va = block + k_cur - k_start;
 		pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
 
@@ -102,7 +102,7 @@  static void __init kasan_remap_early_shadow_ro(void)
 	kasan_populate_pte(kasan_early_shadow_pte, prot);
 
 	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
-		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(k_cur), k_cur), k_cur), k_cur);
 		pte_t *ptep = pte_offset_kernel(pmd, k_cur);
 
 		if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
@@ -202,7 +202,7 @@  void __init kasan_early_init(void)
 	unsigned long addr = KASAN_SHADOW_START;
 	unsigned long end = KASAN_SHADOW_END;
 	unsigned long next;
-	pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
+	pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(addr), addr), addr), addr);
 
 	BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ef7b1119b2e2..8262b384dcf3 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -69,8 +69,8 @@  EXPORT_SYMBOL(kmap_prot);
 
 static inline pte_t *virt_to_kpte(unsigned long vaddr)
 {
-	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
-			vaddr), vaddr), vaddr);
+	return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr),
+			vaddr), vaddr), vaddr), vaddr);
 }
 #endif
 
diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c
index f348104eb461..7aaf7155e350 100644
--- a/arch/powerpc/mm/nohash/40x.c
+++ b/arch/powerpc/mm/nohash/40x.c
@@ -104,7 +104,7 @@  unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
 
-		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
 		*pmdp++ = __pmd(val);
 		*pmdp++ = __pmd(val);
 		*pmdp++ = __pmd(val);
@@ -119,7 +119,7 @@  unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
 
-		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+		pmdp = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(v), v), v), v);
 		*pmdp = __pmd(val);
 
 		v += LARGE_PAGE_SIZE_4M;
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 4637fdd469cf..a62d59a928be 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -73,6 +73,7 @@  static void __init *early_alloc_pgtable(unsigned long size)
 int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 {
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -80,7 +81,10 @@  int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 	BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
 	if (slab_is_available()) {
 		pgdp = pgd_offset_k(ea);
-		pudp = pud_alloc(&init_mm, pgdp, ea);
+		p4dp = p4d_alloc(&init_mm, pgdp, ea);
+		if (!p4dp)
+			return -ENOMEM;
+		pudp = pud_alloc(&init_mm, p4dp, ea);
 		if (!pudp)
 			return -ENOMEM;
 		pmdp = pmd_alloc(&init_mm, pudp, ea);
@@ -91,13 +95,16 @@  int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 			return -ENOMEM;
 	} else {
 		pgdp = pgd_offset_k(ea);
-#ifndef __PAGETABLE_PUD_FOLDED
 		if (pgd_none(*pgdp)) {
 			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
 			pgd_populate(&init_mm, pgdp, pudp);
 		}
-#endif /* !__PAGETABLE_PUD_FOLDED */
-		pudp = pud_offset(pgdp, ea);
+		p4dp = p4d_offset(pgdp, ea);
+		if (p4d_none(*p4dp)) {
+			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+			p4d_populate(&init_mm, p4dp, pudp);
+		}
+		pudp = pud_offset(p4dp, ea);
 		if (pud_none(*pudp)) {
 			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
 			pud_populate(&init_mm, pudp, pmdp);
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index e3759b69f81b..dca6a72da26a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -265,6 +265,7 @@  int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -272,7 +273,9 @@  void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 		return;
 	pgd = mm->pgd + pgd_index(addr);
 	BUG_ON(pgd_none(*pgd));
-	pud = pud_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	BUG_ON(p4d_none(*p4d));
+	pud = pud_offset(p4d, addr);
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, addr);
 	/*
@@ -313,6 +316,7 @@  pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 			bool *is_thp, unsigned *hpage_shift)
 {
 	pgd_t pgd, *pgdp;
+	p4d_t p4d, *p4dp;
 	pud_t pud, *pudp;
 	pmd_t pmd, *pmdp;
 	pte_t *ret_pte;
@@ -346,13 +350,30 @@  pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 		goto out_huge;
 	}
 
+	pdshift = P4D_SHIFT;
+	p4dp = p4d_offset(&pgd, ea);
+	p4d  = READ_ONCE(*p4dp);
+
+	if (p4d_none(p4d))
+		return NULL;
+
+	if (p4d_is_leaf(p4d)) {
+		ret_pte = (pte_t *)p4dp;
+		goto out;
+	}
+
+	if (is_hugepd(__hugepd(p4d_val(p4d)))) {
+		hpdp = (hugepd_t *)&p4d;
+		goto out_huge;
+	}
+
 	/*
 	 * Even if we end up with an unmap, the pgtable will not
 	 * be freed, because we do an rcu free and here we are
 	 * irq disabled
 	 */
 	pdshift = PUD_SHIFT;
-	pudp = pud_offset(&pgd, ea);
+	pudp = pud_offset(&p4d, ea);
 	pud  = READ_ONCE(*pudp);
 
 	if (pud_none(pud))
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 5fb90edd865e..ad217e5e039f 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -63,7 +63,7 @@  int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
 	int err = -ENOMEM;
 
 	/* Use upper 10 bits of VA to index the first level map */
-	pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
+	pd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
 	/* Use middle 10 bits of VA to index the second-level map */
 	if (likely(slab_is_available()))
 		pg = pte_alloc_kernel(pd, va);
@@ -130,6 +130,7 @@  static int
 get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
 {
         pgd_t	*pgd;
+	p4d_t	*p4d;
 	pud_t	*pud;
         pmd_t	*pmd;
         pte_t	*pte;
@@ -137,17 +138,20 @@  get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
 
         pgd = pgd_offset(mm, addr & PAGE_MASK);
         if (pgd) {
-		pud = pud_offset(pgd, addr & PAGE_MASK);
-		if (pud && pud_present(*pud)) {
-			pmd = pmd_offset(pud, addr & PAGE_MASK);
-			if (pmd_present(*pmd)) {
-				pte = pte_offset_map(pmd, addr & PAGE_MASK);
-				if (pte) {
-					retval = 1;
-					*ptep = pte;
-					if (pmdp)
-						*pmdp = pmd;
-					/* XXX caller needs to do pte_unmap, yuck */
+		p4d = p4d_offset(pgd, addr & PAGE_MASK);
+		if (p4d && p4d_present(*p4d)) {
+			pud = pud_offset(p4d, addr & PAGE_MASK);
+			if (pud && pud_present(*pud)) {
+				pmd = pmd_offset(pud, addr & PAGE_MASK);
+				if (pmd_present(*pmd)) {
+					pte = pte_offset_map(pmd, addr & PAGE_MASK);
+					if (pte) {
+						retval = 1;
+						*ptep = pte;
+						if (pmdp)
+							*pmdp = pmd;
+						/* XXX caller needs to do pte_unmap, yuck */
+					}
 				}
 			}
 		}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e78832dce7bb..1f86a88fd4bb 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -101,13 +101,13 @@  EXPORT_SYMBOL(__pte_frag_size_shift);
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /* 4 level page table */
-struct page *pgd_page(pgd_t pgd)
+struct page *p4d_page(p4d_t p4d)
 {
-	if (pgd_is_leaf(pgd)) {
-		VM_WARN_ON(!pgd_huge(pgd));
-		return pte_page(pgd_pte(pgd));
+	if (p4d_is_leaf(p4d)) {
+		VM_WARN_ON(!p4d_huge(p4d));
+		return pte_page(p4d_pte(p4d));
 	}
-	return virt_to_page(pgd_page_vaddr(pgd));
+	return virt_to_page(p4d_page_vaddr(p4d));
 }
 #endif
 
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index a07278027c6f..ac360ad865a8 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -417,9 +417,9 @@  static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
 	}
 }
 
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
 {
-	pud_t *pud = pud_offset(pgd, 0);
+	pud_t *pud = pud_offset(p4d, 0);
 	unsigned long addr;
 	unsigned int i;
 
@@ -431,6 +431,20 @@  static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
 	}
 }
 
+static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+	p4d_t *p4d = p4d_offset(pgd, 0);
+	unsigned long addr;
+	unsigned int i;
+
+	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+		addr = start + i * P4D_SIZE;
+		if (!p4d_none(*p4d))
+			/* p4d exists */
+			walk_pud(st, p4d, addr);
+	}
+}
+
 static void walk_pagetables(struct pg_state *st)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -445,7 +459,7 @@  static void walk_pagetables(struct pg_state *st)
 		addr = KERN_VIRT_START + i * PGDIR_SIZE;
 		if (!pgd_none(*pgd))
 			/* pgd exists */
-			walk_pud(st, pgd, addr);
+			walk_p4d(st, pgd, addr);
 	}
 }
 
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 206156255247..7bd4b81d5b5d 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -277,9 +277,9 @@  static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
 	}
 }
 
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
 {
-	pud_t *pud = pud_offset(pgd, 0);
+	pud_t *pud = pud_offset(p4d, 0);
 	unsigned long addr;
 	unsigned int i;
 
@@ -293,6 +293,22 @@  static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
 	}
 }
 
+static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+	p4d_t *p4d = p4d_offset(pgd, 0);
+	unsigned long addr;
+	unsigned int i;
+
+	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+		addr = start + i * P4D_SIZE;
+		if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d))
+			/* p4d exists */
+			walk_pud(st, p4d, addr);
+		else
+			note_page(st, addr, 2, p4d_val(*p4d));
+	}
+}
+
 static void walk_pagetables(struct pg_state *st)
 {
 	unsigned int i;
@@ -306,7 +322,7 @@  static void walk_pagetables(struct pg_state *st)
 	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
 		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
 			/* pgd exists */
-			walk_pud(st, pgd, addr);
+			walk_p4d(st, pgd, addr);
 		else
 			note_page(st, addr, 1, pgd_val(*pgd));
 	}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index e8c84d265602..c7bd1145b268 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3130,6 +3130,7 @@  static void show_pte(unsigned long addr)
 	struct task_struct *tsk = NULL;
 	struct mm_struct *mm;
 	pgd_t *pgdp, *pgdir;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
@@ -3174,7 +3175,21 @@  static void show_pte(unsigned long addr)
 	}
 	printf("pgdp @ 0x%px = 0x%016lx\n", pgdp, pgd_val(*pgdp));
 
-	pudp = pud_offset(pgdp, addr);
+	p4dp = p4d_offset(pgdp, addr);
+
+	if (p4d_none(*p4dp)) {
+		printf("No valid P4D\n");
+		return;
+	}
+
+	if (p4d_is_leaf(*p4dp)) {
+		format_pte(p4dp, p4d_val(*p4dp));
+		return;
+	}
+
+	printf("p4dp @ 0x%px = 0x%016lx\n", p4dp, p4d_val(*p4dp));
+
+	pudp = pud_offset(p4dp, addr);
 
 	if (pud_none(*pudp)) {
 		printf("No valid PUD\n");