diff mbox series

[v3,07/13] riscv: Implement sv48 support

Message ID 20211206104657.433304-8-alexandre.ghiti@canonical.com (mailing list archive)
State New, archived
Headers show
Series Introduce sv48 support without relocatable kernel | expand

Commit Message

Alexandre Ghiti Dec. 6, 2021, 10:46 a.m. UTC
By adding a new 4th level of page table, give the 64-bit kernel the
ability to address 2^48 bytes of virtual address space: in practice, that
offers 128TB of virtual address space to userspace and allows up to 64TB
of physical memory.

If the underlying hardware does not support sv48, we automatically fall
back to a standard 3-level page table by folding the new PUD level into
the PGDIR level. In order to detect HW capabilities at runtime, we use the
SATP feature that ignores writes with an unsupported mode.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
---
 arch/riscv/Kconfig                      |   4 +-
 arch/riscv/include/asm/csr.h            |   3 +-
 arch/riscv/include/asm/fixmap.h         |   1 +
 arch/riscv/include/asm/kasan.h          |   6 +-
 arch/riscv/include/asm/page.h           |  14 ++
 arch/riscv/include/asm/pgalloc.h        |  40 +++++
 arch/riscv/include/asm/pgtable-64.h     | 108 +++++++++++-
 arch/riscv/include/asm/pgtable.h        |  24 ++-
 arch/riscv/kernel/head.S                |   3 +-
 arch/riscv/mm/context.c                 |   4 +-
 arch/riscv/mm/init.c                    | 212 +++++++++++++++++++++---
 arch/riscv/mm/kasan_init.c              | 137 ++++++++++++++-
 drivers/firmware/efi/libstub/efi-stub.c |   2 +
 13 files changed, 514 insertions(+), 44 deletions(-)

Comments

Alexandre Ghiti Dec. 6, 2021, 11:05 a.m. UTC | #1
On 12/6/21 11:46, Alexandre Ghiti wrote:
> By adding a new 4th level of page table, give the possibility to 64bit
> kernel to address 2^48 bytes of virtual address: in practice, that offers
> 128TB of virtual address space to userspace and allows up to 64TB of
> physical memory.
>
> If the underlying hardware does not support sv48, we will automatically
> fallback to a standard 3-level page table by folding the new PUD level into
> PGDIR level. In order to detect HW capabilities at runtime, we
> use SATP feature that ignores writes with an unsupported mode.
>
> Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
> ---
>   arch/riscv/Kconfig                      |   4 +-
>   arch/riscv/include/asm/csr.h            |   3 +-
>   arch/riscv/include/asm/fixmap.h         |   1 +
>   arch/riscv/include/asm/kasan.h          |   6 +-
>   arch/riscv/include/asm/page.h           |  14 ++
>   arch/riscv/include/asm/pgalloc.h        |  40 +++++
>   arch/riscv/include/asm/pgtable-64.h     | 108 +++++++++++-
>   arch/riscv/include/asm/pgtable.h        |  24 ++-
>   arch/riscv/kernel/head.S                |   3 +-
>   arch/riscv/mm/context.c                 |   4 +-
>   arch/riscv/mm/init.c                    | 212 +++++++++++++++++++++---
>   arch/riscv/mm/kasan_init.c              | 137 ++++++++++++++-
>   drivers/firmware/efi/libstub/efi-stub.c |   2 +
>   13 files changed, 514 insertions(+), 44 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index ac6c0cd9bc29..d28fe0148e13 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -150,7 +150,7 @@ config PAGE_OFFSET
>   	hex
>   	default 0xC0000000 if 32BIT
>   	default 0x80000000 if 64BIT && !MMU
> -	default 0xffffffd800000000 if 64BIT
> +	default 0xffffaf8000000000 if 64BIT
>   
>   config KASAN_SHADOW_OFFSET
>   	hex
> @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM
>   
>   config PGTABLE_LEVELS
>   	int
> -	default 3 if 64BIT
> +	default 4 if 64BIT
>   	default 2
>   
>   config LOCKDEP_SUPPORT
> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
> index 87ac65696871..3fdb971c7896 100644
> --- a/arch/riscv/include/asm/csr.h
> +++ b/arch/riscv/include/asm/csr.h
> @@ -40,14 +40,13 @@
>   #ifndef CONFIG_64BIT
>   #define SATP_PPN	_AC(0x003FFFFF, UL)
>   #define SATP_MODE_32	_AC(0x80000000, UL)
> -#define SATP_MODE	SATP_MODE_32
>   #define SATP_ASID_BITS	9
>   #define SATP_ASID_SHIFT	22
>   #define SATP_ASID_MASK	_AC(0x1FF, UL)
>   #else
>   #define SATP_PPN	_AC(0x00000FFFFFFFFFFF, UL)
>   #define SATP_MODE_39	_AC(0x8000000000000000, UL)
> -#define SATP_MODE	SATP_MODE_39
> +#define SATP_MODE_48	_AC(0x9000000000000000, UL)
>   #define SATP_ASID_BITS	16
>   #define SATP_ASID_SHIFT	44
>   #define SATP_ASID_MASK	_AC(0xFFFF, UL)
> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> index 54cbf07fb4e9..58a718573ad6 100644
> --- a/arch/riscv/include/asm/fixmap.h
> +++ b/arch/riscv/include/asm/fixmap.h
> @@ -24,6 +24,7 @@ enum fixed_addresses {
>   	FIX_HOLE,
>   	FIX_PTE,
>   	FIX_PMD,
> +	FIX_PUD,
>   	FIX_TEXT_POKE1,
>   	FIX_TEXT_POKE0,
>   	FIX_EARLYCON_MEM_BASE,
> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
> index 743e6ff57996..0b85e363e778 100644
> --- a/arch/riscv/include/asm/kasan.h
> +++ b/arch/riscv/include/asm/kasan.h
> @@ -28,7 +28,11 @@
>   #define KASAN_SHADOW_SCALE_SHIFT	3
>   
>   #define KASAN_SHADOW_SIZE	(UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
> -#define KASAN_SHADOW_START	(KASAN_SHADOW_END - KASAN_SHADOW_SIZE)
> +/*
> + * Depending on the size of the virtual address space, the region may not be
> + * aligned on PGDIR_SIZE, so force its alignment to ease its population.
> + */
> +#define KASAN_SHADOW_START	((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
>   #define KASAN_SHADOW_END	MODULES_LOWEST_VADDR
>   #define KASAN_SHADOW_OFFSET	_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
>   
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index e03559f9b35e..d089fe46f7d8 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -31,7 +31,20 @@
>    * When not using MMU this corresponds to the first free page in
>    * physical memory (aligned on a page boundary).
>    */
> +#ifdef CONFIG_64BIT
> +#ifdef CONFIG_MMU
> +#define PAGE_OFFSET		kernel_map.page_offset
> +#else
> +#define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
> +#endif
> +/*
> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
> + * define the PAGE_OFFSET value for SV39.
> + */
> +#define PAGE_OFFSET_L3		_AC(0xffffffd800000000, UL)
> +#else
>   #define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
> +#endif /* CONFIG_64BIT */
>   
>   /*
>    * Half of the kernel address space (half of the entries of the page global
> @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base;
>   #endif /* CONFIG_MMU */
>   
>   struct kernel_mapping {
> +	unsigned long page_offset;
>   	unsigned long virt_addr;
>   	uintptr_t phys_addr;
>   	uintptr_t size;
> diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
> index 0af6933a7100..11823004b87a 100644
> --- a/arch/riscv/include/asm/pgalloc.h
> +++ b/arch/riscv/include/asm/pgalloc.h
> @@ -11,6 +11,8 @@
>   #include <asm/tlb.h>
>   
>   #ifdef CONFIG_MMU
> +#define __HAVE_ARCH_PUD_ALLOC_ONE
> +#define __HAVE_ARCH_PUD_FREE
>   #include <asm-generic/pgalloc.h>
>   
>   static inline void pmd_populate_kernel(struct mm_struct *mm,
> @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
>   
>   	set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
>   }
> +
> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
> +{
> +	if (pgtable_l4_enabled) {
> +		unsigned long pfn = virt_to_pfn(pud);
> +
> +		set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> +	}
> +}
> +
> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> +				     pud_t *pud)
> +{
> +	if (pgtable_l4_enabled) {
> +		unsigned long pfn = virt_to_pfn(pud);
> +
> +		set_p4d_safe(p4d,
> +			     __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> +	}
> +}
> +
> +#define pud_alloc_one pud_alloc_one
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> +{
> +	if (pgtable_l4_enabled)
> +		return __pud_alloc_one(mm, addr);
> +
> +	return NULL;
> +}
> +
> +#define pud_free pud_free
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> +	if (pgtable_l4_enabled)
> +		__pud_free(mm, pud);
> +}
> +
> +#define __pud_free_tlb(tlb, pud, addr)  pud_free((tlb)->mm, pud)
>   #endif /* __PAGETABLE_PMD_FOLDED */
>   
>   static inline pgd_t *pgd_alloc(struct mm_struct *mm)
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index 228261aa9628..bbbdd66e5e2f 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -8,16 +8,36 @@
>   
>   #include <linux/const.h>
>   
> -#define PGDIR_SHIFT     30
> +extern bool pgtable_l4_enabled;
> +
> +#define PGDIR_SHIFT_L3  30
> +#define PGDIR_SHIFT_L4  39
> +#define PGDIR_SIZE_L3   (_AC(1, UL) << PGDIR_SHIFT_L3)
> +
> +#define PGDIR_SHIFT     (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3)
>   /* Size of region mapped by a page global directory */
>   #define PGDIR_SIZE      (_AC(1, UL) << PGDIR_SHIFT)
>   #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
>   
> +/* pud is folded into pgd in case of 3-level page table */
> +#define PUD_SHIFT      30
> +#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
> +#define PUD_MASK       (~(PUD_SIZE - 1))
> +
>   #define PMD_SHIFT       21
>   /* Size of region mapped by a page middle directory */
>   #define PMD_SIZE        (_AC(1, UL) << PMD_SHIFT)
>   #define PMD_MASK        (~(PMD_SIZE - 1))
>   
> +/* Page Upper Directory entry */
> +typedef struct {
> +	unsigned long pud;
> +} pud_t;
> +
> +#define pud_val(x)      ((x).pud)
> +#define __pud(x)        ((pud_t) { (x) })
> +#define PTRS_PER_PUD    (PAGE_SIZE / sizeof(pud_t))
> +
>   /* Page Middle Directory entry */
>   typedef struct {
>   	unsigned long pmd;
> @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp)
>   	set_pud(pudp, __pud(0));
>   }
>   
> +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
> +{
> +	return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> +}
> +
> +static inline unsigned long _pud_pfn(pud_t pud)
> +{
> +	return pud_val(pud) >> _PAGE_PFN_SHIFT;
> +}
> +
>   static inline pmd_t *pud_pgtable(pud_t pud)
>   {
>   	return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
> @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud)
>   	return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
>   }
>   
> +#define mm_pud_folded  mm_pud_folded
> +static inline bool mm_pud_folded(struct mm_struct *mm)
> +{
> +	if (pgtable_l4_enabled)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
> +
>   static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
>   {
>   	return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
>   #define pmd_ERROR(e) \
>   	pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
>   
> +#define pud_ERROR(e)   \
> +	pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
> +
> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		*p4dp = p4d;
> +	else
> +		set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
> +}
> +
> +static inline int p4d_none(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return (p4d_val(p4d) == 0);
> +
> +	return 0;
> +}
> +
> +static inline int p4d_present(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return (p4d_val(p4d) & _PAGE_PRESENT);
> +
> +	return 1;
> +}
> +
> +static inline int p4d_bad(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return !p4d_present(p4d);
> +
> +	return 0;
> +}
> +
> +static inline void p4d_clear(p4d_t *p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		set_p4d(p4d, __p4d(0));
> +}
> +
> +static inline pud_t *p4d_pgtable(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +
> +	return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
> +}
> +
> +static inline struct page *p4d_page(p4d_t p4d)
> +{
> +	return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +}
> +
> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> +
> +#define pud_offset pud_offset
> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +{
> +	if (pgtable_l4_enabled)
> +		return p4d_pgtable(*p4d) + pud_index(address);
> +
> +	return (pud_t *)p4d;
> +}
> +
>   #endif /* _ASM_RISCV_PGTABLE_64_H */
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index e1a52e22ad7e..e1c74ef4ead2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -51,7 +51,7 @@
>    * position vmemmap directly below the VMALLOC region.
>    */
>   #ifdef CONFIG_64BIT
> -#define VA_BITS		39
> +#define VA_BITS		(pgtable_l4_enabled ? 48 : 39)
>   #else
>   #define VA_BITS		32
>   #endif
> @@ -90,8 +90,7 @@
>   
>   #ifndef __ASSEMBLY__
>   
> -/* Page Upper Directory not used in RISC-V */
> -#include <asm-generic/pgtable-nopud.h>
> +#include <asm-generic/pgtable-nop4d.h>
>   #include <asm/page.h>
>   #include <asm/tlbflush.h>
>   #include <linux/mm_types.h>
> @@ -113,6 +112,17 @@
>   #define XIP_FIXUP(addr)		(addr)
>   #endif /* CONFIG_XIP_KERNEL */
>   
> +struct pt_alloc_ops {
> +	pte_t *(*get_pte_virt)(phys_addr_t pa);
> +	phys_addr_t (*alloc_pte)(uintptr_t va);
> +#ifndef __PAGETABLE_PMD_FOLDED
> +	pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> +	phys_addr_t (*alloc_pmd)(uintptr_t va);
> +	pud_t *(*get_pud_virt)(phys_addr_t pa);
> +	phys_addr_t (*alloc_pud)(uintptr_t va);
> +#endif
> +};
> +
>   #ifdef CONFIG_MMU
>   /* Number of entries in the page global directory */
>   #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
> @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
>    * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
>    */
>   #ifdef CONFIG_64BIT
> -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
> +#define TASK_SIZE      (PGDIR_SIZE * PTRS_PER_PGD / 2)
> +#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
>   #else
> -#define TASK_SIZE FIXADDR_START
> +#define TASK_SIZE	FIXADDR_START
> +#define TASK_SIZE_MIN	TASK_SIZE
>   #endif
>   
>   #else /* CONFIG_MMU */
> @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa;
>   #define dtb_early_va	_dtb_early_va
>   #define dtb_early_pa	_dtb_early_pa
>   #endif /* CONFIG_XIP_KERNEL */
> +extern u64 satp_mode;
> +extern bool pgtable_l4_enabled;
>   
>   void paging_init(void);
>   void misc_mem_init(void);
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index 52c5ff9804c5..c3c0ed559770 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -95,7 +95,8 @@ relocate:
>   
>   	/* Compute satp for kernel page tables, but don't load it yet */
>   	srl a2, a0, PAGE_SHIFT
> -	li a1, SATP_MODE
> +	la a1, satp_mode
> +	REG_L a1, 0(a1)
>   	or a2, a2, a1
>   
>   	/*
> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
> index ee3459cb6750..a7246872bd30 100644
> --- a/arch/riscv/mm/context.c
> +++ b/arch/riscv/mm/context.c
> @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
>   switch_mm_fast:
>   	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
>   		  ((cntx & asid_mask) << SATP_ASID_SHIFT) |
> -		  SATP_MODE);
> +		  satp_mode);
>   
>   	if (need_flush_tlb)
>   		local_flush_tlb_all();
> @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
>   static void set_mm_noasid(struct mm_struct *mm)
>   {
>   	/* Switch the page table and blindly nuke entire local TLB */
> -	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
> +	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
>   	local_flush_tlb_all();
>   }
>   
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 1552226fb6bd..6a19a1b1caf8 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map);
>   #define kernel_map	(*(struct kernel_mapping *)XIP_FIXUP(&kernel_map))
>   #endif
>   
> +#ifdef CONFIG_64BIT
> +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39;
> +#else
> +u64 satp_mode = SATP_MODE_32;
> +#endif
> +EXPORT_SYMBOL(satp_mode);
> +
> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ?
> +				true : false;
> +EXPORT_SYMBOL(pgtable_l4_enabled);
> +
>   phys_addr_t phys_ram_base __ro_after_init;
>   EXPORT_SYMBOL(phys_ram_base);
>   
> @@ -53,15 +64,6 @@ extern char _start[];
>   void *_dtb_early_va __initdata;
>   uintptr_t _dtb_early_pa __initdata;
>   
> -struct pt_alloc_ops {
> -	pte_t *(*get_pte_virt)(phys_addr_t pa);
> -	phys_addr_t (*alloc_pte)(uintptr_t va);
> -#ifndef __PAGETABLE_PMD_FOLDED
> -	pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> -	phys_addr_t (*alloc_pmd)(uintptr_t va);
> -#endif
> -};
> -
>   static phys_addr_t dma32_phys_limit __initdata;
>   
>   static void __init zone_sizes_init(void)
> @@ -222,7 +224,7 @@ static void __init setup_bootmem(void)
>   }
>   
>   #ifdef CONFIG_MMU
> -static struct pt_alloc_ops _pt_ops __initdata;
> +struct pt_alloc_ops _pt_ops __initdata;
>   
>   #ifdef CONFIG_XIP_KERNEL
>   #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
> @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
>   static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
>   
>   pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
> +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
>   static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
>   
>   #ifdef CONFIG_XIP_KERNEL
> @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
>   #define early_pmd      ((pmd_t *)XIP_FIXUP(early_pmd))
>   #endif /* CONFIG_XIP_KERNEL */
>   
> +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
> +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
> +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
> +
> +#ifdef CONFIG_XIP_KERNEL
> +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud))
> +#define fixmap_pud     ((pud_t *)XIP_FIXUP(fixmap_pud))
> +#define early_pud      ((pud_t *)XIP_FIXUP(early_pud))
> +#endif /* CONFIG_XIP_KERNEL */
> +
>   static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
>   {
>   	/* Before MMU is enabled */
> @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
>   
>   static phys_addr_t __init alloc_pmd_early(uintptr_t va)
>   {
> -	BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
> +	BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT);
>   
>   	return (uintptr_t)early_pmd;
>   }
> @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
>   	create_pte_mapping(ptep, va, pa, sz, prot);
>   }
>   
> -#define pgd_next_t		pmd_t
> -#define alloc_pgd_next(__va)	pt_ops.alloc_pmd(__va)
> -#define get_pgd_next_virt(__pa)	pt_ops.get_pmd_virt(__pa)
> +static pud_t *__init get_pud_virt_early(phys_addr_t pa)
> +{
> +	return (pud_t *)((uintptr_t)pa);
> +}
> +
> +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
> +{
> +	clear_fixmap(FIX_PUD);
> +	return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
> +}
> +
> +static pud_t *__init get_pud_virt_late(phys_addr_t pa)
> +{
> +	return (pud_t *)__va(pa);
> +}
> +
> +static phys_addr_t __init alloc_pud_early(uintptr_t va)
> +{
> +	/* Only one PUD is available for early mapping */
> +	BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
> +
> +	return (uintptr_t)early_pud;
> +}
> +
> +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
> +{
> +	return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> +}
> +
> +static phys_addr_t alloc_pud_late(uintptr_t va)
> +{
> +	unsigned long vaddr;
> +
> +	vaddr = __get_free_page(GFP_KERNEL);
> +	BUG_ON(!vaddr);
> +	return __pa(vaddr);
> +}
> +
> +static void __init create_pud_mapping(pud_t *pudp,
> +				      uintptr_t va, phys_addr_t pa,
> +				      phys_addr_t sz, pgprot_t prot)
> +{
> +	pmd_t *nextp;
> +	phys_addr_t next_phys;
> +	uintptr_t pud_index = pud_index(va);
> +
> +	if (sz == PUD_SIZE) {
> +		if (pud_val(pudp[pud_index]) == 0)
> +			pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
> +		return;
> +	}
> +
> +	if (pud_val(pudp[pud_index]) == 0) {
> +		next_phys = pt_ops.alloc_pmd(va);
> +		pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
> +		nextp = pt_ops.get_pmd_virt(next_phys);
> +		memset(nextp, 0, PAGE_SIZE);
> +	} else {
> +		next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
> +		nextp = pt_ops.get_pmd_virt(next_phys);
> +	}
> +
> +	create_pmd_mapping(nextp, va, pa, sz, prot);
> +}
> +
> +#define pgd_next_t		pud_t
> +#define alloc_pgd_next(__va)	(pgtable_l4_enabled ?			\
> +		pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va))
> +#define get_pgd_next_virt(__pa)	(pgtable_l4_enabled ?			\
> +		pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa))
>   #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)	\
> -	create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next		fixmap_pmd
> +				(pgtable_l4_enabled ?			\
> +		create_pud_mapping(__nextp, __va, __pa, __sz, __prot) :	\
> +		create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot))
> +#define fixmap_pgd_next		(pgtable_l4_enabled ?			\
> +		(uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
> +#define trampoline_pgd_next	(pgtable_l4_enabled ?			\
> +		(uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
> +#define early_dtb_pgd_next	(pgtable_l4_enabled ?			\
> +		(uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)
>   #else
>   #define pgd_next_t		pte_t
>   #define alloc_pgd_next(__va)	pt_ops.alloc_pte(__va)
>   #define get_pgd_next_virt(__pa)	pt_ops.get_pte_virt(__pa)
>   #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)	\
>   	create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next		fixmap_pte
> +#define fixmap_pgd_next		((uintptr_t)fixmap_pte)
> +#define early_dtb_pgd_next	((uintptr_t)early_dtb_pmd)
> +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot)
>   #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot)
> -#endif
> +#endif /* __PAGETABLE_PMD_FOLDED */
>   
>   void __init create_pgd_mapping(pgd_t *pgdp,
>   				      uintptr_t va, phys_addr_t pa,
> @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va)
>   }
>   #endif /* CONFIG_STRICT_KERNEL_RWX */
>   
> +#ifdef CONFIG_64BIT
> +static void __init disable_pgtable_l4(void)
> +{
> +	pgtable_l4_enabled = false;
> +	kernel_map.page_offset = PAGE_OFFSET_L3;
> +	satp_mode = SATP_MODE_39;
> +}
> +
> +/*
> + * There is a simple way to determine if 4-level is supported by the
> + * underlying hardware: establish 1:1 mapping in 4-level page table mode
> + * then read SATP to see if the configuration was taken into account
> + * meaning sv48 is supported.
> + */
> +static __init void set_satp_mode(void)
> +{
> +	u64 identity_satp, hw_satp;
> +	uintptr_t set_satp_mode_pmd;
> +
> +	set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
> +	create_pgd_mapping(early_pg_dir,
> +			   set_satp_mode_pmd, (uintptr_t)early_pud,
> +			   PGDIR_SIZE, PAGE_TABLE);
> +	create_pud_mapping(early_pud,
> +			   set_satp_mode_pmd, (uintptr_t)early_pmd,
> +			   PUD_SIZE, PAGE_TABLE);
> +	/* Handle the case where set_satp_mode straddles 2 PMDs */
> +	create_pmd_mapping(early_pmd,
> +			   set_satp_mode_pmd, set_satp_mode_pmd,
> +			   PMD_SIZE, PAGE_KERNEL_EXEC);
> +	create_pmd_mapping(early_pmd,
> +			   set_satp_mode_pmd + PMD_SIZE,
> +			   set_satp_mode_pmd + PMD_SIZE,
> +			   PMD_SIZE, PAGE_KERNEL_EXEC);
> +
> +	identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
> +
> +	local_flush_tlb_all();
> +	csr_write(CSR_SATP, identity_satp);
> +	hw_satp = csr_swap(CSR_SATP, 0ULL);
> +	local_flush_tlb_all();
> +
> +	if (hw_satp != identity_satp)
> +		disable_pgtable_l4();
> +
> +	memset(early_pg_dir, 0, PAGE_SIZE);
> +	memset(early_pud, 0, PAGE_SIZE);
> +	memset(early_pmd, 0, PAGE_SIZE);
> +}
> +#endif
> +
>   /*
>    * setup_vm() is called from head.S with MMU-off.
>    *
> @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa)
>   	uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1);
>   
>   	create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA,
> -			   IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa,
> +			   IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa,
>   			   PGDIR_SIZE,
>   			   IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL);
>   
> +	if (pgtable_l4_enabled) {
> +		create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA,
> +				   (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE);
> +	}
> +
>   	if (IS_ENABLED(CONFIG_64BIT)) {
>   		create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
>   				   pa, PMD_SIZE, PAGE_KERNEL);
> @@ -593,6 +738,8 @@ void pt_ops_set_early(void)
>   #ifndef __PAGETABLE_PMD_FOLDED
>   	pt_ops.alloc_pmd = alloc_pmd_early;
>   	pt_ops.get_pmd_virt = get_pmd_virt_early;
> +	pt_ops.alloc_pud = alloc_pud_early;
> +	pt_ops.get_pud_virt = get_pud_virt_early;
>   #endif
>   }
>   
> @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void)
>   #ifndef __PAGETABLE_PMD_FOLDED
>   	pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap);
>   	pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap);
> +	pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap);
> +	pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap);
>   #endif
>   }
>   
> @@ -625,6 +774,8 @@ void pt_ops_set_late(void)
>   #ifndef __PAGETABLE_PMD_FOLDED
>   	pt_ops.alloc_pmd = alloc_pmd_late;
>   	pt_ops.get_pmd_virt = get_pmd_virt_late;
> +	pt_ops.alloc_pud = alloc_pud_late;
> +	pt_ops.get_pud_virt = get_pud_virt_late;
>   #endif
>   }
>   
> @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>   	pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd;
>   
>   	kernel_map.virt_addr = KERNEL_LINK_ADDR;
> +	kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
>   
>   #ifdef CONFIG_XIP_KERNEL
>   	kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
> @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>   	kernel_map.phys_addr = (uintptr_t)(&_start);
>   	kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
>   #endif
> +
> +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
> +	set_satp_mode();
> +#endif
> +
>   	kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
>   	kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
>   
> @@ -676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>   
>   	/* Setup early PGD for fixmap */
>   	create_pgd_mapping(early_pg_dir, FIXADDR_START,
> -			   (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> +			   fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>   
>   #ifndef __PAGETABLE_PMD_FOLDED
> -	/* Setup fixmap PMD */
> +	/* Setup fixmap PUD and PMD */
> +	if (pgtable_l4_enabled)
> +		create_pud_mapping(fixmap_pud, FIXADDR_START,
> +				   (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
>   	create_pmd_mapping(fixmap_pmd, FIXADDR_START,
>   			   (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
>   	/* Setup trampoline PGD and PMD */
>   	create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
> -			   (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> +			   trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> +	if (pgtable_l4_enabled)
> +		create_pud_mapping(trampoline_pud, kernel_map.virt_addr,
> +				   (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
>   #ifdef CONFIG_XIP_KERNEL
>   	create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
>   			   kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
> @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>   	 * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap
>   	 * range can not span multiple pmds.
>   	 */
> -	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
> +	BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
>   		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
>   
>   #ifndef __PAGETABLE_PMD_FOLDED
> @@ -783,9 +946,10 @@ static void __init setup_vm_final(void)
>   	/* Clear fixmap PTE and PMD mappings */
>   	clear_fixmap(FIX_PTE);
>   	clear_fixmap(FIX_PMD);
> +	clear_fixmap(FIX_PUD);
>   
>   	/* Move to swapper page table */
> -	csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> +	csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
>   	local_flush_tlb_all();
>   
>   	pt_ops_set_late();
> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> index 1434a0225140..993f50571a3b 100644
> --- a/arch/riscv/mm/kasan_init.c
> +++ b/arch/riscv/mm/kasan_init.c
> @@ -11,7 +11,29 @@
>   #include <asm/fixmap.h>
>   #include <asm/pgalloc.h>
>   
> +/*
> + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57
> + * which is right before the kernel.
> + *
> + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate
> + * the page global directory with kasan_early_shadow_pmd.
> + *
> + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping
> + * must be divided as follows:
> + * - the first PGD entry, although incomplete, is populated with
> + *   kasan_early_shadow_pud/p4d
> + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d
> + * - the last PGD entry is shared with the kernel mapping so populated at the
> + *   lower levels pud/p4d
> + *
> + * In addition, when shallow populating a kasan region (for example vmalloc),
> + * this region may also not be aligned on PGDIR size, so we must go down to the
> + * pud level too.
> + */
> +
>   extern pgd_t early_pg_dir[PTRS_PER_PGD];
> +extern struct pt_alloc_ops _pt_ops __initdata;
> +#define pt_ops	_pt_ops
>   
>   static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
>   {
> @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned
>   	set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
>   }
>   
> -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
> +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end)
>   {
>   	phys_addr_t phys_addr;
>   	pmd_t *pmdp, *base_pmd;
>   	unsigned long next;
>   
> -	base_pmd = (pmd_t *)pgd_page_vaddr(*pgd);
> -	if (base_pmd == lm_alias(kasan_early_shadow_pmd))
> +	if (pud_none(*pud)) {
>   		base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
> +	} else {
> +		base_pmd = (pmd_t *)pud_pgtable(*pud);
> +		if (base_pmd == lm_alias(kasan_early_shadow_pmd))
> +			base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
> +	}
>   
>   	pmdp = base_pmd + pmd_index(vaddr);
>   
> @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned
>   	 * it entirely, memblock could allocate a page at a physical address
>   	 * where KASAN is not populated yet and then we'd get a page fault.
>   	 */
> -	set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> +	set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> +}
> +
> +static void __init kasan_populate_pud(pgd_t *pgd,
> +				      unsigned long vaddr, unsigned long end,
> +				      bool early)
> +{
> +	phys_addr_t phys_addr;
> +	pud_t *pudp, *base_pud;
> +	unsigned long next;
> +
> +	if (early) {
> +		/*
> +		 * We can't use pgd_page_vaddr here as it would return a linear
> +		 * mapping address but it is not mapped yet, but when populating
> +		 * early_pg_dir, we need the physical address and when populating
> +		 * swapper_pg_dir, we need the kernel virtual address so use
> +		 * pt_ops facility.
> +		 */
> +		base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd)));
> +	} else {
> +		base_pud = (pud_t *)pgd_page_vaddr(*pgd);
> +		if (base_pud == lm_alias(kasan_early_shadow_pud))
> +			base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE);
> +	}
> +
> +	pudp = base_pud + pud_index(vaddr);
> +
> +	do {
> +		next = pud_addr_end(vaddr, end);
> +
> +		if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) {
> +			if (early) {
> +				phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd));
> +				set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE));
> +				continue;
> +			} else {
> +				phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE);
> +				if (phys_addr) {
> +					set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL));
> +					continue;
> +				}
> +			}
> +		}
> +
> +		kasan_populate_pmd(pudp, vaddr, next);
> +	} while (pudp++, vaddr = next, vaddr != end);
> +
> +	/*
> +	 * Wait for the whole PGD to be populated before setting the PGD in
> +	 * the page table, otherwise, if we did set the PGD before populating
> +	 * it entirely, memblock could allocate a page at a physical address
> +	 * where KASAN is not populated yet and then we'd get a page fault.
> +	 */
> +	if (!early)
> +		set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE));
>   }
>   
> +#define kasan_early_shadow_pgd_next			(pgtable_l4_enabled ?	\
> +				(uintptr_t)kasan_early_shadow_pud :		\
> +				(uintptr_t)kasan_early_shadow_pmd)
> +#define kasan_populate_pgd_next(pgdp, vaddr, next, early)			\
> +		(pgtable_l4_enabled ?						\
> +			kasan_populate_pud(pgdp, vaddr, next, early) :		\
> +			kasan_populate_pmd((pud_t *)pgdp, vaddr, next))
> +
>   static void __init kasan_populate_pgd(pgd_t *pgdp,
>   				      unsigned long vaddr, unsigned long end,
>   				      bool early)
> @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp,
>   			}
>   		}
>   
> -		kasan_populate_pmd(pgdp, vaddr, next);
> +		kasan_populate_pgd_next(pgdp, vaddr, next, early);
>   	} while (pgdp++, vaddr = next, vaddr != end);
>   }
>   
> @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end)
>   	memset(start, KASAN_SHADOW_INIT, end - start);
>   }
>   
> +static void __init kasan_shallow_populate_pud(pgd_t *pgdp,
> +					      unsigned long vaddr, unsigned long end,
> +					      bool kasan_populate)
> +{
> +	unsigned long next;
> +	pud_t *pudp, *base_pud;
> +	pmd_t *base_pmd;
> +	bool is_kasan_pmd;
> +
> +	base_pud = (pud_t *)pgd_page_vaddr(*pgdp);
> +	pudp = base_pud + pud_index(vaddr);
> +
> +	if (kasan_populate)
> +		memcpy(base_pud, (void *)kasan_early_shadow_pgd_next,
> +		       sizeof(pud_t) * PTRS_PER_PUD);
> +
> +	do {
> +		next = pud_addr_end(vaddr, end);
> +		is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd));
> +
> +		if (is_kasan_pmd) {
> +			base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> +			set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> +		}
> +	} while (pudp++, vaddr = next, vaddr != end);
> +}
> +
>   static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end)
>   {
>   	unsigned long next;
>   	void *p;
>   	pgd_t *pgd_k = pgd_offset_k(vaddr);
> +	bool is_kasan_pgd_next;
>   
>   	do {
>   		next = pgd_addr_end(vaddr, end);
> -		if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) {
> +		is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) ==
> +				     (unsigned long)lm_alias(kasan_early_shadow_pgd_next));
> +
> +		if (is_kasan_pgd_next) {
>   			p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
>   			set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
>   		}
> +
> +		if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE)
> +			continue;
> +
> +		kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next);
>   	} while (pgd_k++, vaddr = next, vaddr != end);
>   }


@Qinglin: I can deal with the sv57 KASAN population if need be, as it is a
bit tricky and I think it would save you quite some time :)


>   
> diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
> index 26e69788f27a..b3db5d91ed38 100644
> --- a/drivers/firmware/efi/libstub/efi-stub.c
> +++ b/drivers/firmware/efi/libstub/efi-stub.c
> @@ -40,6 +40,8 @@
>   
>   #ifdef CONFIG_ARM64
>   # define EFI_RT_VIRTUAL_LIMIT	DEFAULT_MAP_WINDOW_64
> +#elif defined(CONFIG_RISCV)
> +# define EFI_RT_VIRTUAL_LIMIT	TASK_SIZE_MIN
>   #else
>   # define EFI_RT_VIRTUAL_LIMIT	TASK_SIZE
>   #endif
Qinglin Pan Dec. 9, 2021, 4:32 a.m. UTC | #2
Hi Alex,

On 2021/12/6 19:05, Alexandre ghiti wrote:
 > On 12/6/21 11:46, Alexandre Ghiti wrote:
 >> By adding a new 4th level of page table, give the possibility to 64bit
 >> kernel to address 2^48 bytes of virtual address: in practice, that offers
 >> 128TB of virtual address space to userspace and allows up to 64TB of
 >> physical memory.
 >>
 >> If the underlying hardware does not support sv48, we will automatically
 >> fallback to a standard 3-level page table by folding the new PUD level into
 >> PGDIR level. In order to detect HW capabilities at runtime, we
 >> use SATP feature that ignores writes with an unsupported mode.
 >>
 >> Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
 >> ---
 >>   arch/riscv/Kconfig                      |   4 +-
 >>   arch/riscv/include/asm/csr.h            |   3 +-
 >>   arch/riscv/include/asm/fixmap.h         |   1 +
 >>   arch/riscv/include/asm/kasan.h          |   6 +-
 >>   arch/riscv/include/asm/page.h           |  14 ++
 >>   arch/riscv/include/asm/pgalloc.h        |  40 +++++
 >>   arch/riscv/include/asm/pgtable-64.h     | 108 +++++++++++-
 >>   arch/riscv/include/asm/pgtable.h        |  24 ++-
 >>   arch/riscv/kernel/head.S                |   3 +-
 >>   arch/riscv/mm/context.c                 |   4 +-
 >>   arch/riscv/mm/init.c                    | 212 +++++++++++++++++++++---
 >>   arch/riscv/mm/kasan_init.c              | 137 ++++++++++++++-
 >>   drivers/firmware/efi/libstub/efi-stub.c |   2 +
 >>   13 files changed, 514 insertions(+), 44 deletions(-)
 >>
 >> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
 >> index ac6c0cd9bc29..d28fe0148e13 100644
 >> --- a/arch/riscv/Kconfig
 >> +++ b/arch/riscv/Kconfig
 >> @@ -150,7 +150,7 @@ config PAGE_OFFSET
 >>       hex
 >>       default 0xC0000000 if 32BIT
 >>       default 0x80000000 if 64BIT && !MMU
 >> -    default 0xffffffd800000000 if 64BIT
 >> +    default 0xffffaf8000000000 if 64BIT
 >>     config KASAN_SHADOW_OFFSET
 >>       hex
 >> @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM
 >>     config PGTABLE_LEVELS
 >>       int
 >> -    default 3 if 64BIT
 >> +    default 4 if 64BIT
 >>       default 2
 >>     config LOCKDEP_SUPPORT
 >> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
 >> index 87ac65696871..3fdb971c7896 100644
 >> --- a/arch/riscv/include/asm/csr.h
 >> +++ b/arch/riscv/include/asm/csr.h
 >> @@ -40,14 +40,13 @@
 >>   #ifndef CONFIG_64BIT
 >>   #define SATP_PPN    _AC(0x003FFFFF, UL)
 >>   #define SATP_MODE_32    _AC(0x80000000, UL)
 >> -#define SATP_MODE    SATP_MODE_32
 >>   #define SATP_ASID_BITS    9
 >>   #define SATP_ASID_SHIFT    22
 >>   #define SATP_ASID_MASK    _AC(0x1FF, UL)
 >>   #else
 >>   #define SATP_PPN    _AC(0x00000FFFFFFFFFFF, UL)
 >>   #define SATP_MODE_39    _AC(0x8000000000000000, UL)
 >> -#define SATP_MODE    SATP_MODE_39
 >> +#define SATP_MODE_48    _AC(0x9000000000000000, UL)
 >>   #define SATP_ASID_BITS    16
 >>   #define SATP_ASID_SHIFT    44
 >>   #define SATP_ASID_MASK    _AC(0xFFFF, UL)
 >> diff --git a/arch/riscv/include/asm/fixmap.h 
b/arch/riscv/include/asm/fixmap.h
 >> index 54cbf07fb4e9..58a718573ad6 100644
 >> --- a/arch/riscv/include/asm/fixmap.h
 >> +++ b/arch/riscv/include/asm/fixmap.h
 >> @@ -24,6 +24,7 @@ enum fixed_addresses {
 >>       FIX_HOLE,
 >>       FIX_PTE,
 >>       FIX_PMD,
 >> +    FIX_PUD,
 >>       FIX_TEXT_POKE1,
 >>       FIX_TEXT_POKE0,
 >>       FIX_EARLYCON_MEM_BASE,
 >> diff --git a/arch/riscv/include/asm/kasan.h 
b/arch/riscv/include/asm/kasan.h
 >> index 743e6ff57996..0b85e363e778 100644
 >> --- a/arch/riscv/include/asm/kasan.h
 >> +++ b/arch/riscv/include/asm/kasan.h
 >> @@ -28,7 +28,11 @@
 >>   #define KASAN_SHADOW_SCALE_SHIFT    3
 >>     #define KASAN_SHADOW_SIZE    (UL(1) << ((VA_BITS - 1) - 
KASAN_SHADOW_SCALE_SHIFT))
 >> -#define KASAN_SHADOW_START    (KASAN_SHADOW_END - KASAN_SHADOW_SIZE)
 >> +/*
 >> + * Depending on the size of the virtual address space, the region 
may not be
 >> + * aligned on PGDIR_SIZE, so force its alignment to ease its 
population.
 >> + */
 >> +#define KASAN_SHADOW_START    ((KASAN_SHADOW_END - 
KASAN_SHADOW_SIZE) & PGDIR_MASK)
 >>   #define KASAN_SHADOW_END    MODULES_LOWEST_VADDR
 >>   #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 >>   diff --git a/arch/riscv/include/asm/page.h 
b/arch/riscv/include/asm/page.h
 >> index e03559f9b35e..d089fe46f7d8 100644
 >> --- a/arch/riscv/include/asm/page.h
 >> +++ b/arch/riscv/include/asm/page.h
 >> @@ -31,7 +31,20 @@
 >>    * When not using MMU this corresponds to the first free page in
 >>    * physical memory (aligned on a page boundary).
 >>    */
 >> +#ifdef CONFIG_64BIT
 >> +#ifdef CONFIG_MMU
 >> +#define PAGE_OFFSET        kernel_map.page_offset
 >> +#else
 >> +#define PAGE_OFFSET        _AC(CONFIG_PAGE_OFFSET, UL)
 >> +#endif
 >> +/*
 >> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address 
space so
 >> + * define the PAGE_OFFSET value for SV39.
 >> + */
 >> +#define PAGE_OFFSET_L3        _AC(0xffffffd800000000, UL)
 >> +#else
 >>   #define PAGE_OFFSET        _AC(CONFIG_PAGE_OFFSET, UL)
 >> +#endif /* CONFIG_64BIT */
 >>     /*
 >>    * Half of the kernel address space (half of the entries of the 
page global
 >> @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base;
 >>   #endif /* CONFIG_MMU */
 >>     struct kernel_mapping {
 >> +    unsigned long page_offset;
 >>       unsigned long virt_addr;
 >>       uintptr_t phys_addr;
 >>       uintptr_t size;
 >> diff --git a/arch/riscv/include/asm/pgalloc.h 
b/arch/riscv/include/asm/pgalloc.h
 >> index 0af6933a7100..11823004b87a 100644
 >> --- a/arch/riscv/include/asm/pgalloc.h
 >> +++ b/arch/riscv/include/asm/pgalloc.h
 >> @@ -11,6 +11,8 @@
 >>   #include <asm/tlb.h>
 >>     #ifdef CONFIG_MMU
 >> +#define __HAVE_ARCH_PUD_ALLOC_ONE
 >> +#define __HAVE_ARCH_PUD_FREE
 >>   #include <asm-generic/pgalloc.h>
 >>     static inline void pmd_populate_kernel(struct mm_struct *mm,
 >> @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct 
*mm, pud_t *pud, pmd_t *pmd)
 >>         set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
 >>   }
 >> +
 >> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, 
pud_t *pud)
 >> +{
 >> +    if (pgtable_l4_enabled) {
 >> +        unsigned long pfn = virt_to_pfn(pud);
 >> +
 >> +        set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
 >> +    }
 >> +}
 >> +
 >> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
 >> +                     pud_t *pud)
 >> +{
 >> +    if (pgtable_l4_enabled) {
 >> +        unsigned long pfn = virt_to_pfn(pud);
 >> +
 >> +        set_p4d_safe(p4d,
 >> +                 __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
 >> +    }
 >> +}
 >> +
 >> +#define pud_alloc_one pud_alloc_one
 >> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned 
long addr)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        return __pud_alloc_one(mm, addr);
 >> +
 >> +    return NULL;
 >> +}
 >> +
 >> +#define pud_free pud_free
 >> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        __pud_free(mm, pud);
 >> +}
 >> +
 >> +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud)
 >>   #endif /* __PAGETABLE_PMD_FOLDED */
 >>     static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 >> diff --git a/arch/riscv/include/asm/pgtable-64.h 
b/arch/riscv/include/asm/pgtable-64.h
 >> index 228261aa9628..bbbdd66e5e2f 100644
 >> --- a/arch/riscv/include/asm/pgtable-64.h
 >> +++ b/arch/riscv/include/asm/pgtable-64.h
 >> @@ -8,16 +8,36 @@
 >>     #include <linux/const.h>
 >>   -#define PGDIR_SHIFT     30
 >> +extern bool pgtable_l4_enabled;
 >> +
 >> +#define PGDIR_SHIFT_L3  30
 >> +#define PGDIR_SHIFT_L4  39
 >> +#define PGDIR_SIZE_L3   (_AC(1, UL) << PGDIR_SHIFT_L3)
 >> +
 >> +#define PGDIR_SHIFT     (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : 
PGDIR_SHIFT_L3)
 >>   /* Size of region mapped by a page global directory */
 >>   #define PGDIR_SIZE      (_AC(1, UL) << PGDIR_SHIFT)
 >>   #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
 >>   +/* pud is folded into pgd in case of 3-level page table */
 >> +#define PUD_SHIFT      30
 >> +#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
 >> +#define PUD_MASK       (~(PUD_SIZE - 1))
 >> +
 >>   #define PMD_SHIFT       21
 >>   /* Size of region mapped by a page middle directory */
 >>   #define PMD_SIZE        (_AC(1, UL) << PMD_SHIFT)
 >>   #define PMD_MASK        (~(PMD_SIZE - 1))
 >>   +/* Page Upper Directory entry */
 >> +typedef struct {
 >> +    unsigned long pud;
 >> +} pud_t;
 >> +
 >> +#define pud_val(x)      ((x).pud)
 >> +#define __pud(x)        ((pud_t) { (x) })
 >> +#define PTRS_PER_PUD    (PAGE_SIZE / sizeof(pud_t))
 >> +
 >>   /* Page Middle Directory entry */
 >>   typedef struct {
 >>       unsigned long pmd;
 >> @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp)
 >>       set_pud(pudp, __pud(0));
 >>   }
 >>   +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
 >> +{
 >> +    return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
 >> +}
 >> +
 >> +static inline unsigned long _pud_pfn(pud_t pud)
 >> +{
 >> +    return pud_val(pud) >> _PAGE_PFN_SHIFT;
 >> +}
 >> +
 >>   static inline pmd_t *pud_pgtable(pud_t pud)
 >>   {
 >>       return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
 >> @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud)
 >>       return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
 >>   }
 >>   +#define mm_pud_folded  mm_pud_folded
 >> +static inline bool mm_pud_folded(struct mm_struct *mm)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        return false;
 >> +
 >> +    return true;
 >> +}
 >> +
 >> +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
 >> +
 >>   static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
 >>   {
 >>       return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
 >> @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
 >>   #define pmd_ERROR(e) \
 >>       pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
 >>   +#define pud_ERROR(e)   \
 >> +    pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
 >> +
 >> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        *p4dp = p4d;
 >> +    else
 >> +        set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
 >> +}
 >> +
 >> +static inline int p4d_none(p4d_t p4d)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        return (p4d_val(p4d) == 0);
 >> +
 >> +    return 0;
 >> +}
 >> +
 >> +static inline int p4d_present(p4d_t p4d)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        return (p4d_val(p4d) & _PAGE_PRESENT);
 >> +
 >> +    return 1;
 >> +}
 >> +
 >> +static inline int p4d_bad(p4d_t p4d)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        return !p4d_present(p4d);
 >> +
 >> +    return 0;
 >> +}
 >> +
 >> +static inline void p4d_clear(p4d_t *p4d)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        set_p4d(p4d, __p4d(0));
 >> +}
 >> +
 >> +static inline pud_t *p4d_pgtable(p4d_t p4d)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
 >> +
 >> +    return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
 >> +}
 >> +
 >> +static inline struct page *p4d_page(p4d_t p4d)
 >> +{
 >> +    return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
 >> +}
 >> +
 >> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
 >> +
 >> +#define pud_offset pud_offset
 >> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 >> +{
 >> +    if (pgtable_l4_enabled)
 >> +        return p4d_pgtable(*p4d) + pud_index(address);
 >> +
 >> +    return (pud_t *)p4d;
 >> +}
 >> +
 >>   #endif /* _ASM_RISCV_PGTABLE_64_H */
 >> diff --git a/arch/riscv/include/asm/pgtable.h 
b/arch/riscv/include/asm/pgtable.h
 >> index e1a52e22ad7e..e1c74ef4ead2 100644
 >> --- a/arch/riscv/include/asm/pgtable.h
 >> +++ b/arch/riscv/include/asm/pgtable.h
 >> @@ -51,7 +51,7 @@
 >>    * position vmemmap directly below the VMALLOC region.
 >>    */
 >>   #ifdef CONFIG_64BIT
 >> -#define VA_BITS        39
 >> +#define VA_BITS        (pgtable_l4_enabled ? 48 : 39)
 >>   #else
 >>   #define VA_BITS        32
 >>   #endif
 >> @@ -90,8 +90,7 @@
 >>     #ifndef __ASSEMBLY__
 >>   -/* Page Upper Directory not used in RISC-V */
 >> -#include <asm-generic/pgtable-nopud.h>
 >> +#include <asm-generic/pgtable-nop4d.h>
 >>   #include <asm/page.h>
 >>   #include <asm/tlbflush.h>
 >>   #include <linux/mm_types.h>
 >> @@ -113,6 +112,17 @@
 >>   #define XIP_FIXUP(addr)        (addr)
 >>   #endif /* CONFIG_XIP_KERNEL */
 >>   +struct pt_alloc_ops {
 >> +    pte_t *(*get_pte_virt)(phys_addr_t pa);
 >> +    phys_addr_t (*alloc_pte)(uintptr_t va);
 >> +#ifndef __PAGETABLE_PMD_FOLDED
 >> +    pmd_t *(*get_pmd_virt)(phys_addr_t pa);
 >> +    phys_addr_t (*alloc_pmd)(uintptr_t va);
 >> +    pud_t *(*get_pud_virt)(phys_addr_t pa);
 >> +    phys_addr_t (*alloc_pud)(uintptr_t va);
 >> +#endif
 >> +};
 >> +
 >>   #ifdef CONFIG_MMU
 >>   /* Number of entries in the page global directory */
 >>   #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
 >> @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct 
vm_area_struct *vma,
 >>    * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
 >>    */
 >>   #ifdef CONFIG_64BIT
 >> -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
 >> +#define TASK_SIZE      (PGDIR_SIZE * PTRS_PER_PGD / 2)
 >> +#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
 >>   #else
 >> -#define TASK_SIZE FIXADDR_START
 >> +#define TASK_SIZE    FIXADDR_START
 >> +#define TASK_SIZE_MIN    TASK_SIZE
 >>   #endif
 >>     #else /* CONFIG_MMU */
 >> @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa;
 >>   #define dtb_early_va    _dtb_early_va
 >>   #define dtb_early_pa    _dtb_early_pa
 >>   #endif /* CONFIG_XIP_KERNEL */
 >> +extern u64 satp_mode;
 >> +extern bool pgtable_l4_enabled;
 >>     void paging_init(void);
 >>   void misc_mem_init(void);
 >> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
 >> index 52c5ff9804c5..c3c0ed559770 100644
 >> --- a/arch/riscv/kernel/head.S
 >> +++ b/arch/riscv/kernel/head.S
 >> @@ -95,7 +95,8 @@ relocate:
 >>         /* Compute satp for kernel page tables, but don't load it yet */
 >>       srl a2, a0, PAGE_SHIFT
 >> -    li a1, SATP_MODE
 >> +    la a1, satp_mode
 >> +    REG_L a1, 0(a1)
 >>       or a2, a2, a1
 >>         /*
 >> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
 >> index ee3459cb6750..a7246872bd30 100644
 >> --- a/arch/riscv/mm/context.c
 >> +++ b/arch/riscv/mm/context.c
 >> @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, 
unsigned int cpu)
 >>   switch_mm_fast:
 >>       csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
 >>             ((cntx & asid_mask) << SATP_ASID_SHIFT) |
 >> -          SATP_MODE);
 >> +          satp_mode);
 >>         if (need_flush_tlb)
 >>           local_flush_tlb_all();
 >> @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, 
unsigned int cpu)
 >>   static void set_mm_noasid(struct mm_struct *mm)
 >>   {
 >>       /* Switch the page table and blindly nuke entire local TLB */
 >> -    csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
 >> +    csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
 >>       local_flush_tlb_all();
 >>   }
 >>   diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
 >> index 1552226fb6bd..6a19a1b1caf8 100644
 >> --- a/arch/riscv/mm/init.c
 >> +++ b/arch/riscv/mm/init.c
 >> @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map);
 >>   #define kernel_map    (*(struct kernel_mapping 
*)XIP_FIXUP(&kernel_map))
 >>   #endif
 >>   +#ifdef CONFIG_64BIT
 >> +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : 
SATP_MODE_39;
 >> +#else
 >> +u64 satp_mode = SATP_MODE_32;
 >> +#endif
 >> +EXPORT_SYMBOL(satp_mode);
 >> +
 >> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && 
!IS_ENABLED(CONFIG_XIP_KERNEL) ?
 >> +                true : false;
 >> +EXPORT_SYMBOL(pgtable_l4_enabled);
 >> +
 >>   phys_addr_t phys_ram_base __ro_after_init;
 >>   EXPORT_SYMBOL(phys_ram_base);
 >>   @@ -53,15 +64,6 @@ extern char _start[];
 >>   void *_dtb_early_va __initdata;
 >>   uintptr_t _dtb_early_pa __initdata;
 >>   -struct pt_alloc_ops {
 >> -    pte_t *(*get_pte_virt)(phys_addr_t pa);
 >> -    phys_addr_t (*alloc_pte)(uintptr_t va);
 >> -#ifndef __PAGETABLE_PMD_FOLDED
 >> -    pmd_t *(*get_pmd_virt)(phys_addr_t pa);
 >> -    phys_addr_t (*alloc_pmd)(uintptr_t va);
 >> -#endif
 >> -};
 >> -
 >>   static phys_addr_t dma32_phys_limit __initdata;
 >>     static void __init zone_sizes_init(void)
 >> @@ -222,7 +224,7 @@ static void __init setup_bootmem(void)
 >>   }
 >>     #ifdef CONFIG_MMU
 >> -static struct pt_alloc_ops _pt_ops __initdata;
 >> +struct pt_alloc_ops _pt_ops __initdata;
 >>     #ifdef CONFIG_XIP_KERNEL
 >>   #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
 >> @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] 
__page_aligned_bss;
 >>   static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
 >>     pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
 >> +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata 
__aligned(PAGE_SIZE);
 >>   static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata 
__aligned(PAGE_SIZE);
 >>     #ifdef CONFIG_XIP_KERNEL
 >> @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata 
__aligned(PAGE_SIZE);
 >>   #define early_pmd      ((pmd_t *)XIP_FIXUP(early_pmd))
 >>   #endif /* CONFIG_XIP_KERNEL */
 >>   +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
 >> +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
 >> +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
 >> +
 >> +#ifdef CONFIG_XIP_KERNEL
 >> +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud))
 >> +#define fixmap_pud     ((pud_t *)XIP_FIXUP(fixmap_pud))
 >> +#define early_pud      ((pud_t *)XIP_FIXUP(early_pud))
 >> +#endif /* CONFIG_XIP_KERNEL */
 >> +
 >>   static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
 >>   {
 >>       /* Before MMU is enabled */
 >> @@ -345,7 +358,7 @@ static pmd_t *__init 
get_pmd_virt_late(phys_addr_t pa)
 >>     static phys_addr_t __init alloc_pmd_early(uintptr_t va)
 >>   {
 >> -    BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
 >> +    BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT);
 >>         return (uintptr_t)early_pmd;
 >>   }
 >> @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
 >>       create_pte_mapping(ptep, va, pa, sz, prot);
 >>   }
 >>   -#define pgd_next_t        pmd_t
 >> -#define alloc_pgd_next(__va)    pt_ops.alloc_pmd(__va)
 >> -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa)
 >> +static pud_t *__init get_pud_virt_early(phys_addr_t pa)
 >> +{
 >> +    return (pud_t *)((uintptr_t)pa);
 >> +}
 >> +
 >> +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
 >> +{
 >> +    clear_fixmap(FIX_PUD);
 >> +    return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
 >> +}
 >> +
 >> +static pud_t *__init get_pud_virt_late(phys_addr_t pa)
 >> +{
 >> +    return (pud_t *)__va(pa);
 >> +}
 >> +
 >> +static phys_addr_t __init alloc_pud_early(uintptr_t va)
 >> +{
 >> +    /* Only one PUD is available for early mapping */
 >> +    BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
 >> +
 >> +    return (uintptr_t)early_pud;
 >> +}
 >> +
 >> +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
 >> +{
 >> +    return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 >> +}
 >> +
 >> +static phys_addr_t alloc_pud_late(uintptr_t va)
 >> +{
 >> +    unsigned long vaddr;
 >> +
 >> +    vaddr = __get_free_page(GFP_KERNEL);
 >> +    BUG_ON(!vaddr);
 >> +    return __pa(vaddr);
 >> +}
 >> +
 >> +static void __init create_pud_mapping(pud_t *pudp,
 >> +                      uintptr_t va, phys_addr_t pa,
 >> +                      phys_addr_t sz, pgprot_t prot)
 >> +{
 >> +    pmd_t *nextp;
 >> +    phys_addr_t next_phys;
 >> +    uintptr_t pud_index = pud_index(va);
 >> +
 >> +    if (sz == PUD_SIZE) {
 >> +        if (pud_val(pudp[pud_index]) == 0)
 >> +            pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
 >> +        return;
 >> +    }
 >> +
 >> +    if (pud_val(pudp[pud_index]) == 0) {
 >> +        next_phys = pt_ops.alloc_pmd(va);
 >> +        pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
 >> +        nextp = pt_ops.get_pmd_virt(next_phys);
 >> +        memset(nextp, 0, PAGE_SIZE);
 >> +    } else {
 >> +        next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
 >> +        nextp = pt_ops.get_pmd_virt(next_phys);
 >> +    }
 >> +
 >> +    create_pmd_mapping(nextp, va, pa, sz, prot);
 >> +}
 >> +
 >> +#define pgd_next_t        pud_t
 >> +#define alloc_pgd_next(__va)    (pgtable_l4_enabled ?            \
 >> +        pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va))
 >> +#define get_pgd_next_virt(__pa)    (pgtable_l4_enabled ?            \
 >> +        pt_ops.get_pud_virt(__pa) : (pgd_next_t 
*)pt_ops.get_pmd_virt(__pa))
 >>   #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, 
__prot)    \
 >> -    create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
 >> -#define fixmap_pgd_next        fixmap_pmd
 >> +                (pgtable_l4_enabled ?            \
 >> +        create_pud_mapping(__nextp, __va, __pa, __sz, __prot) :    \
 >> +        create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot))
 >> +#define fixmap_pgd_next        (pgtable_l4_enabled ?            \
 >> +        (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
 >> +#define trampoline_pgd_next    (pgtable_l4_enabled ?            \
 >> +        (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
 >> +#define early_dtb_pgd_next    (pgtable_l4_enabled ?            \
 >> +        (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)
 >>   #else
 >>   #define pgd_next_t        pte_t
 >>   #define alloc_pgd_next(__va)    pt_ops.alloc_pte(__va)
 >>   #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa)
 >>   #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, 
__prot)    \
 >>       create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
 >> -#define fixmap_pgd_next        fixmap_pte
 >> +#define fixmap_pgd_next        ((uintptr_t)fixmap_pte)
 >> +#define early_dtb_pgd_next    ((uintptr_t)early_dtb_pmd)
 >> +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot)
 >>   #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot)
 >> -#endif
 >> +#endif /* __PAGETABLE_PMD_FOLDED */
 >>     void __init create_pgd_mapping(pgd_t *pgdp,
 >>                         uintptr_t va, phys_addr_t pa,
 >> @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va)
 >>   }
 >>   #endif /* CONFIG_STRICT_KERNEL_RWX */
 >>   +#ifdef CONFIG_64BIT
 >> +static void __init disable_pgtable_l4(void)
 >> +{
 >> +    pgtable_l4_enabled = false;
 >> +    kernel_map.page_offset = PAGE_OFFSET_L3;
 >> +    satp_mode = SATP_MODE_39;
 >> +}
 >> +
 >> +/*
 >> + * There is a simple way to determine if 4-level is supported by the
 >> + * underlying hardware: establish 1:1 mapping in 4-level page table 
mode
 >> + * then read SATP to see if the configuration was taken into account
 >> + * meaning sv48 is supported.
 >> + */
 >> +static __init void set_satp_mode(void)
 >> +{
 >> +    u64 identity_satp, hw_satp;
 >> +    uintptr_t set_satp_mode_pmd;
 >> +
 >> +    set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
 >> +    create_pgd_mapping(early_pg_dir,
 >> +               set_satp_mode_pmd, (uintptr_t)early_pud,
 >> +               PGDIR_SIZE, PAGE_TABLE);
 >> +    create_pud_mapping(early_pud,
 >> +               set_satp_mode_pmd, (uintptr_t)early_pmd,
 >> +               PUD_SIZE, PAGE_TABLE);
 >> +    /* Handle the case where set_satp_mode straddles 2 PMDs */
 >> +    create_pmd_mapping(early_pmd,
 >> +               set_satp_mode_pmd, set_satp_mode_pmd,
 >> +               PMD_SIZE, PAGE_KERNEL_EXEC);
 >> +    create_pmd_mapping(early_pmd,
 >> +               set_satp_mode_pmd + PMD_SIZE,
 >> +               set_satp_mode_pmd + PMD_SIZE,
 >> +               PMD_SIZE, PAGE_KERNEL_EXEC);
 >> +
 >> +    identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
 >> +
 >> +    local_flush_tlb_all();
 >> +    csr_write(CSR_SATP, identity_satp);
 >> +    hw_satp = csr_swap(CSR_SATP, 0ULL);
 >> +    local_flush_tlb_all();
 >> +
 >> +    if (hw_satp != identity_satp)
 >> +        disable_pgtable_l4();
 >> +
 >> +    memset(early_pg_dir, 0, PAGE_SIZE);
 >> +    memset(early_pud, 0, PAGE_SIZE);
 >> +    memset(early_pmd, 0, PAGE_SIZE);
 >> +}
 >> +#endif
 >> +
 >>   /*
 >>    * setup_vm() is called from head.S with MMU-off.
 >>    *
 >> @@ -557,10 +697,15 @@ static void __init 
create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa)
 >>       uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1);
 >>         create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA,
 >> -               IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd 
: pa,
 >> +               IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa,
 >>                  PGDIR_SIZE,
 >>                  IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL);
 >>   +    if (pgtable_l4_enabled) {
 >> +        create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA,
 >> +                   (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE);
 >> +    }
 >> +
 >>       if (IS_ENABLED(CONFIG_64BIT)) {
 >>           create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
 >>                      pa, PMD_SIZE, PAGE_KERNEL);
 >> @@ -593,6 +738,8 @@ void pt_ops_set_early(void)
 >>   #ifndef __PAGETABLE_PMD_FOLDED
 >>       pt_ops.alloc_pmd = alloc_pmd_early;
 >>       pt_ops.get_pmd_virt = get_pmd_virt_early;
 >> +    pt_ops.alloc_pud = alloc_pud_early;
 >> +    pt_ops.get_pud_virt = get_pud_virt_early;
 >>   #endif
 >>   }
 >>   @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void)
 >>   #ifndef __PAGETABLE_PMD_FOLDED
 >>       pt_ops.alloc_pmd = 
kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap);
 >>       pt_ops.get_pmd_virt = 
kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap);
 >> +    pt_ops.alloc_pud = 
kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap);
 >> +    pt_ops.get_pud_virt = 
kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap);
 >>   #endif
 >>   }
 >>   @@ -625,6 +774,8 @@ void pt_ops_set_late(void)
 >>   #ifndef __PAGETABLE_PMD_FOLDED
 >>       pt_ops.alloc_pmd = alloc_pmd_late;
 >>       pt_ops.get_pmd_virt = get_pmd_virt_late;
 >> +    pt_ops.alloc_pud = alloc_pud_late;
 >> +    pt_ops.get_pud_virt = get_pud_virt_late;
 >>   #endif
 >>   }
 >>   @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 >>       pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd;
 >>         kernel_map.virt_addr = KERNEL_LINK_ADDR;
 >> +    kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
 >>     #ifdef CONFIG_XIP_KERNEL
 >>       kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
 >> @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 >>       kernel_map.phys_addr = (uintptr_t)(&_start);
 >>       kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
 >>   #endif
 >> +
 >> +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
 >> +    set_satp_mode();
 >> +#endif
 >> +
 >>       kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
 >>       kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - 
kernel_map.phys_addr;
 >>   @@ -676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t 
dtb_pa)
 >>         /* Setup early PGD for fixmap */
 >>       create_pgd_mapping(early_pg_dir, FIXADDR_START,
 >> -               (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
 >> +               fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
 >>     #ifndef __PAGETABLE_PMD_FOLDED
 >> -    /* Setup fixmap PMD */
 >> +    /* Setup fixmap PUD and PMD */
 >> +    if (pgtable_l4_enabled)
 >> +        create_pud_mapping(fixmap_pud, FIXADDR_START,
 >> +                   (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
 >>       create_pmd_mapping(fixmap_pmd, FIXADDR_START,
 >>                  (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
 >>       /* Setup trampoline PGD and PMD */
 >>       create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
 >> -               (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
 >> +               trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
 >> +    if (pgtable_l4_enabled)
 >> +        create_pud_mapping(trampoline_pud, kernel_map.virt_addr,
 >> +                   (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
 >>   #ifdef CONFIG_XIP_KERNEL
 >>       create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
 >>                  kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
 >> @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 >>        * Bootime fixmap only can handle PMD_SIZE mapping. Thus, 
boot-ioremap
 >>        * range can not span multiple pmds.
 >>        */
 >> -    BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
 >> +    BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
 >>                != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
 >>     #ifndef __PAGETABLE_PMD_FOLDED
 >> @@ -783,9 +946,10 @@ static void __init setup_vm_final(void)
 >>       /* Clear fixmap PTE and PMD mappings */
 >>       clear_fixmap(FIX_PTE);
 >>       clear_fixmap(FIX_PMD);
 >> +    clear_fixmap(FIX_PUD);
 >>         /* Move to swapper page table */
 >> -    csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | 
SATP_MODE);
 >> +    csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | 
satp_mode);
 >>       local_flush_tlb_all();
 >>         pt_ops_set_late();
 >> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
 >> index 1434a0225140..993f50571a3b 100644
 >> --- a/arch/riscv/mm/kasan_init.c
 >> +++ b/arch/riscv/mm/kasan_init.c
 >> @@ -11,7 +11,29 @@
 >>   #include <asm/fixmap.h>
 >>   #include <asm/pgalloc.h>
 >>   +/*
 >> + * Kasan shadow region must lie at a fixed address across sv39, 
sv48 and sv57
 >> + * which is right before the kernel.
 >> + *
 >> + * For sv39, the region is aligned on PGDIR_SIZE so we only need to 
populate
 >> + * the page global directory with kasan_early_shadow_pmd.
 >> + *
 >> + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so 
the mapping
 >> + * must be divided as follows:
 >> + * - the first PGD entry, although incomplete, is populated with
 >> + *   kasan_early_shadow_pud/p4d
 >> + * - the PGD entries in the middle are populated with 
kasan_early_shadow_pud/p4d
 >> + * - the last PGD entry is shared with the kernel mapping so 
populated at the
 >> + *   lower levels pud/p4d
 >> + *
 >> + * In addition, when shallow populating a kasan region (for example 
vmalloc),
 >> + * this region may also not be aligned on PGDIR size, so we must go 
down to the
 >> + * pud level too.
 >> + */
 >> +
 >>   extern pgd_t early_pg_dir[PTRS_PER_PGD];
 >> +extern struct pt_alloc_ops _pt_ops __initdata;
 >> +#define pt_ops    _pt_ops
 >>     static void __init kasan_populate_pte(pmd_t *pmd, unsigned long 
vaddr, unsigned long end)
 >>   {
 >> @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t 
*pmd, unsigned long vaddr, unsigned
 >>       set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
 >>   }
 >>   -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long 
vaddr, unsigned long end)
 >> +static void __init kasan_populate_pmd(pud_t *pud, unsigned long 
vaddr, unsigned long end)
 >>   {
 >>       phys_addr_t phys_addr;
 >>       pmd_t *pmdp, *base_pmd;
 >>       unsigned long next;
 >>   -    base_pmd = (pmd_t *)pgd_page_vaddr(*pgd);
 >> -    if (base_pmd == lm_alias(kasan_early_shadow_pmd))
 >> +    if (pud_none(*pud)) {
 >>           base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), 
PAGE_SIZE);
 >> +    } else {
 >> +        base_pmd = (pmd_t *)pud_pgtable(*pud);
 >> +        if (base_pmd == lm_alias(kasan_early_shadow_pmd))
 >> +            base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), 
PAGE_SIZE);
 >> +    }
 >>         pmdp = base_pmd + pmd_index(vaddr);
 >>   @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t 
*pgd, unsigned long vaddr, unsigned
 >>        * it entirely, memblock could allocate a page at a physical 
address
 >>        * where KASAN is not populated yet and then we'd get a page 
fault.
 >>        */
 >> -    set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
 >> +    set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
 >> +}
 >> +
 >> +static void __init kasan_populate_pud(pgd_t *pgd,
 >> +                      unsigned long vaddr, unsigned long end,
 >> +                      bool early)
 >> +{
 >> +    phys_addr_t phys_addr;
 >> +    pud_t *pudp, *base_pud;
 >> +    unsigned long next;
 >> +
 >> +    if (early) {
 >> +        /*
 >> +         * We can't use pgd_page_vaddr here as it would return a linear
 >> +         * mapping address but it is not mapped yet, but when 
populating
 >> +         * early_pg_dir, we need the physical address and when 
populating
 >> +         * swapper_pg_dir, we need the kernel virtual address so use
 >> +         * pt_ops facility.
 >> +         */
 >> +        base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd)));
 >> +    } else {
 >> +        base_pud = (pud_t *)pgd_page_vaddr(*pgd);
 >> +        if (base_pud == lm_alias(kasan_early_shadow_pud))
 >> +            base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), 
PAGE_SIZE);
 >> +    }
 >> +
 >> +    pudp = base_pud + pud_index(vaddr);
 >> +
 >> +    do {
 >> +        next = pud_addr_end(vaddr, end);
 >> +
 >> +        if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next 
- vaddr) >= PUD_SIZE) {
 >> +            if (early) {
 >> +                phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd));
 >> +                set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), 
PAGE_TABLE));
 >> +                continue;
 >> +            } else {
 >> +                phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE);
 >> +                if (phys_addr) {
 >> +                    set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), 
PAGE_KERNEL));
 >> +                    continue;
 >> +                }
 >> +            }
 >> +        }
 >> +
 >> +        kasan_populate_pmd(pudp, vaddr, next);
 >> +    } while (pudp++, vaddr = next, vaddr != end);
 >> +
 >> +    /*
 >> +     * Wait for the whole PGD to be populated before setting the PGD in
 >> +     * the page table, otherwise, if we did set the PGD before 
populating
 >> +     * it entirely, memblock could allocate a page at a physical 
address
 >> +     * where KASAN is not populated yet and then we'd get a page fault.
 >> +     */
 >> +    if (!early)
 >> +        set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE));
 >>   }
 >>   +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ?    \
 >> +                (uintptr_t)kasan_early_shadow_pud : \
 >> +                (uintptr_t)kasan_early_shadow_pmd)
 >> +#define kasan_populate_pgd_next(pgdp, vaddr, next, early)            \
 >> +        (pgtable_l4_enabled ?                        \
 >> +            kasan_populate_pud(pgdp, vaddr, next, early) :        \
 >> +            kasan_populate_pmd((pud_t *)pgdp, vaddr, next))
 >> +
 >>   static void __init kasan_populate_pgd(pgd_t *pgdp,
 >>                         unsigned long vaddr, unsigned long end,
 >>                         bool early)
 >> @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp,
 >>               }
 >>           }
 >>   -        kasan_populate_pmd(pgdp, vaddr, next);
 >> +        kasan_populate_pgd_next(pgdp, vaddr, next, early);
 >>       } while (pgdp++, vaddr = next, vaddr != end);
 >>   }
 >>   @@ -157,18 +246,54 @@ static void __init kasan_populate(void 
*start, void *end)
 >>       memset(start, KASAN_SHADOW_INIT, end - start);
 >>   }
 >>   +static void __init kasan_shallow_populate_pud(pgd_t *pgdp,
 >> +                          unsigned long vaddr, unsigned long end,
 >> +                          bool kasan_populate)
 >> +{
 >> +    unsigned long next;
 >> +    pud_t *pudp, *base_pud;
 >> +    pmd_t *base_pmd;
 >> +    bool is_kasan_pmd;
 >> +
 >> +    base_pud = (pud_t *)pgd_page_vaddr(*pgdp);
 >> +    pudp = base_pud + pud_index(vaddr);
 >> +
 >> +    if (kasan_populate)
 >> +        memcpy(base_pud, (void *)kasan_early_shadow_pgd_next,
 >> +               sizeof(pud_t) * PTRS_PER_PUD);
 >> +
 >> +    do {
 >> +        next = pud_addr_end(vaddr, end);
 >> +        is_kasan_pmd = (pud_pgtable(*pudp) == 
lm_alias(kasan_early_shadow_pmd));
 >> +
 >> +        if (is_kasan_pmd) {
 >> +            base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
 >> +            set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), 
PAGE_TABLE));
 >> +        }
 >> +    } while (pudp++, vaddr = next, vaddr != end);
 >> +}
 >> +
 >>   static void __init kasan_shallow_populate_pgd(unsigned long vaddr, 
unsigned long end)
 >>   {
 >>       unsigned long next;
 >>       void *p;
 >>       pgd_t *pgd_k = pgd_offset_k(vaddr);
 >> +    bool is_kasan_pgd_next;
 >>         do {
 >>           next = pgd_addr_end(vaddr, end);
 >> -        if (pgd_page_vaddr(*pgd_k) == (unsigned 
long)lm_alias(kasan_early_shadow_pmd)) {
 >> +        is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) ==
 >> +                     (unsigned 
long)lm_alias(kasan_early_shadow_pgd_next));
 >> +
 >> +        if (is_kasan_pgd_next) {
 >>               p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
 >>               set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
 >>           }
 >> +
 >> +        if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= 
PGDIR_SIZE)
 >> +            continue;
 >> +
 >> +        kasan_shallow_populate_pud(pgd_k, vaddr, next, 
is_kasan_pgd_next);
 >>       } while (pgd_k++, vaddr = next, vaddr != end);
 >>   }
 >
 >
 > @Qinglin: I can deal with sv57 kasan population if needs be as it is 
a bit tricky and I think it would save you quite some time :)

Thanks so much for your suggestion! I would like to give it a try myself first,
as I am now preparing a new Sv57 patchset :) I will ask for your help if I
run into any trouble, and thanks again!

Yours,
Qinglin

 >
 >
 >>   diff --git a/drivers/firmware/efi/libstub/efi-stub.c 
b/drivers/firmware/efi/libstub/efi-stub.c
 >> index 26e69788f27a..b3db5d91ed38 100644
 >> --- a/drivers/firmware/efi/libstub/efi-stub.c
 >> +++ b/drivers/firmware/efi/libstub/efi-stub.c
 >> @@ -40,6 +40,8 @@
 >>     #ifdef CONFIG_ARM64
 >>   # define EFI_RT_VIRTUAL_LIMIT    DEFAULT_MAP_WINDOW_64
 >> +#elif defined(CONFIG_RISCV)
 >> +# define EFI_RT_VIRTUAL_LIMIT    TASK_SIZE_MIN
 >>   #else
 >>   # define EFI_RT_VIRTUAL_LIMIT    TASK_SIZE
 >>   #endif
Jisheng Zhang Dec. 26, 2021, 8:59 a.m. UTC | #3
On Mon,  6 Dec 2021 11:46:51 +0100
Alexandre Ghiti <alexandre.ghiti@canonical.com> wrote:

> By adding a new 4th level of page table, give the possibility to 64bit
> kernel to address 2^48 bytes of virtual address: in practice, that offers
> 128TB of virtual address space to userspace and allows up to 64TB of
> physical memory.
> 
> If the underlying hardware does not support sv48, we will automatically
> fallback to a standard 3-level page table by folding the new PUD level into
> PGDIR level. In order to detect HW capabilities at runtime, we
> use SATP feature that ignores writes with an unsupported mode.
> 
> Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
> ---
>  arch/riscv/Kconfig                      |   4 +-
>  arch/riscv/include/asm/csr.h            |   3 +-
>  arch/riscv/include/asm/fixmap.h         |   1 +
>  arch/riscv/include/asm/kasan.h          |   6 +-
>  arch/riscv/include/asm/page.h           |  14 ++
>  arch/riscv/include/asm/pgalloc.h        |  40 +++++
>  arch/riscv/include/asm/pgtable-64.h     | 108 +++++++++++-
>  arch/riscv/include/asm/pgtable.h        |  24 ++-
>  arch/riscv/kernel/head.S                |   3 +-
>  arch/riscv/mm/context.c                 |   4 +-
>  arch/riscv/mm/init.c                    | 212 +++++++++++++++++++++---
>  arch/riscv/mm/kasan_init.c              | 137 ++++++++++++++-
>  drivers/firmware/efi/libstub/efi-stub.c |   2 +
>  13 files changed, 514 insertions(+), 44 deletions(-)
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index ac6c0cd9bc29..d28fe0148e13 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -150,7 +150,7 @@ config PAGE_OFFSET
>  	hex
>  	default 0xC0000000 if 32BIT
>  	default 0x80000000 if 64BIT && !MMU
> -	default 0xffffffd800000000 if 64BIT
> +	default 0xffffaf8000000000 if 64BIT
>  
>  config KASAN_SHADOW_OFFSET
>  	hex
> @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM
>  
>  config PGTABLE_LEVELS
>  	int
> -	default 3 if 64BIT
> +	default 4 if 64BIT
>  	default 2
>  
>  config LOCKDEP_SUPPORT
> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
> index 87ac65696871..3fdb971c7896 100644
> --- a/arch/riscv/include/asm/csr.h
> +++ b/arch/riscv/include/asm/csr.h
> @@ -40,14 +40,13 @@
>  #ifndef CONFIG_64BIT
>  #define SATP_PPN	_AC(0x003FFFFF, UL)
>  #define SATP_MODE_32	_AC(0x80000000, UL)
> -#define SATP_MODE	SATP_MODE_32
>  #define SATP_ASID_BITS	9
>  #define SATP_ASID_SHIFT	22
>  #define SATP_ASID_MASK	_AC(0x1FF, UL)
>  #else
>  #define SATP_PPN	_AC(0x00000FFFFFFFFFFF, UL)
>  #define SATP_MODE_39	_AC(0x8000000000000000, UL)
> -#define SATP_MODE	SATP_MODE_39
> +#define SATP_MODE_48	_AC(0x9000000000000000, UL)
>  #define SATP_ASID_BITS	16
>  #define SATP_ASID_SHIFT	44
>  #define SATP_ASID_MASK	_AC(0xFFFF, UL)
> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> index 54cbf07fb4e9..58a718573ad6 100644
> --- a/arch/riscv/include/asm/fixmap.h
> +++ b/arch/riscv/include/asm/fixmap.h
> @@ -24,6 +24,7 @@ enum fixed_addresses {
>  	FIX_HOLE,
>  	FIX_PTE,
>  	FIX_PMD,
> +	FIX_PUD,
>  	FIX_TEXT_POKE1,
>  	FIX_TEXT_POKE0,
>  	FIX_EARLYCON_MEM_BASE,
> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
> index 743e6ff57996..0b85e363e778 100644
> --- a/arch/riscv/include/asm/kasan.h
> +++ b/arch/riscv/include/asm/kasan.h
> @@ -28,7 +28,11 @@
>  #define KASAN_SHADOW_SCALE_SHIFT	3
>  
>  #define KASAN_SHADOW_SIZE	(UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
> -#define KASAN_SHADOW_START	(KASAN_SHADOW_END - KASAN_SHADOW_SIZE)
> +/*
> + * Depending on the size of the virtual address space, the region may not be
> + * aligned on PGDIR_SIZE, so force its alignment to ease its population.
> + */
> +#define KASAN_SHADOW_START	((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
>  #define KASAN_SHADOW_END	MODULES_LOWEST_VADDR
>  #define KASAN_SHADOW_OFFSET	_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
>  
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index e03559f9b35e..d089fe46f7d8 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -31,7 +31,20 @@
>   * When not using MMU this corresponds to the first free page in
>   * physical memory (aligned on a page boundary).
>   */
> +#ifdef CONFIG_64BIT
> +#ifdef CONFIG_MMU
> +#define PAGE_OFFSET		kernel_map.page_offset
> +#else
> +#define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
> +#endif
> +/*
> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
> + * define the PAGE_OFFSET value for SV39.
> + */
> +#define PAGE_OFFSET_L3		_AC(0xffffffd800000000, UL)
> +#else
>  #define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
> +#endif /* CONFIG_64BIT */
>  
>  /*
>   * Half of the kernel address space (half of the entries of the page global
> @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base;
>  #endif /* CONFIG_MMU */
>  
>  struct kernel_mapping {
> +	unsigned long page_offset;
>  	unsigned long virt_addr;
>  	uintptr_t phys_addr;
>  	uintptr_t size;
> diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
> index 0af6933a7100..11823004b87a 100644
> --- a/arch/riscv/include/asm/pgalloc.h
> +++ b/arch/riscv/include/asm/pgalloc.h
> @@ -11,6 +11,8 @@
>  #include <asm/tlb.h>
>  
>  #ifdef CONFIG_MMU
> +#define __HAVE_ARCH_PUD_ALLOC_ONE
> +#define __HAVE_ARCH_PUD_FREE
>  #include <asm-generic/pgalloc.h>
>  
>  static inline void pmd_populate_kernel(struct mm_struct *mm,
> @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
>  
>  	set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
>  }
> +
> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
> +{
> +	if (pgtable_l4_enabled) {
> +		unsigned long pfn = virt_to_pfn(pud);
> +
> +		set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> +	}
> +}
> +
> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> +				     pud_t *pud)
> +{
> +	if (pgtable_l4_enabled) {
> +		unsigned long pfn = virt_to_pfn(pud);
> +
> +		set_p4d_safe(p4d,
> +			     __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> +	}
> +}
> +
> +#define pud_alloc_one pud_alloc_one
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> +{
> +	if (pgtable_l4_enabled)
> +		return __pud_alloc_one(mm, addr);
> +
> +	return NULL;
> +}
> +
> +#define pud_free pud_free
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> +	if (pgtable_l4_enabled)
> +		__pud_free(mm, pud);
> +}
> +
> +#define __pud_free_tlb(tlb, pud, addr)  pud_free((tlb)->mm, pud)
>  #endif /* __PAGETABLE_PMD_FOLDED */
>  
>  static inline pgd_t *pgd_alloc(struct mm_struct *mm)
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index 228261aa9628..bbbdd66e5e2f 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -8,16 +8,36 @@
>  
>  #include <linux/const.h>
>  
> -#define PGDIR_SHIFT     30
> +extern bool pgtable_l4_enabled;
> +
> +#define PGDIR_SHIFT_L3  30
> +#define PGDIR_SHIFT_L4  39
> +#define PGDIR_SIZE_L3   (_AC(1, UL) << PGDIR_SHIFT_L3)
> +
> +#define PGDIR_SHIFT     (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3)
>  /* Size of region mapped by a page global directory */
>  #define PGDIR_SIZE      (_AC(1, UL) << PGDIR_SHIFT)
>  #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
>  
> +/* pud is folded into pgd in case of 3-level page table */
> +#define PUD_SHIFT      30
> +#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
> +#define PUD_MASK       (~(PUD_SIZE - 1))
> +
>  #define PMD_SHIFT       21
>  /* Size of region mapped by a page middle directory */
>  #define PMD_SIZE        (_AC(1, UL) << PMD_SHIFT)
>  #define PMD_MASK        (~(PMD_SIZE - 1))
>  
> +/* Page Upper Directory entry */
> +typedef struct {
> +	unsigned long pud;
> +} pud_t;
> +
> +#define pud_val(x)      ((x).pud)
> +#define __pud(x)        ((pud_t) { (x) })
> +#define PTRS_PER_PUD    (PAGE_SIZE / sizeof(pud_t))
> +
>  /* Page Middle Directory entry */
>  typedef struct {
>  	unsigned long pmd;
> @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp)
>  	set_pud(pudp, __pud(0));
>  }
>  
> +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
> +{
> +	return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> +}
> +
> +static inline unsigned long _pud_pfn(pud_t pud)
> +{
> +	return pud_val(pud) >> _PAGE_PFN_SHIFT;
> +}
> +
>  static inline pmd_t *pud_pgtable(pud_t pud)
>  {
>  	return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
> @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud)
>  	return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
>  }
>  
> +#define mm_pud_folded  mm_pud_folded
> +static inline bool mm_pud_folded(struct mm_struct *mm)
> +{
> +	if (pgtable_l4_enabled)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
> +
>  static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
>  {
>  	return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
>  #define pmd_ERROR(e) \
>  	pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
>  
> +#define pud_ERROR(e)   \
> +	pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
> +
> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		*p4dp = p4d;
> +	else
> +		set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
> +}
> +
> +static inline int p4d_none(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return (p4d_val(p4d) == 0);
> +
> +	return 0;
> +}
> +
> +static inline int p4d_present(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return (p4d_val(p4d) & _PAGE_PRESENT);
> +
> +	return 1;
> +}
> +
> +static inline int p4d_bad(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return !p4d_present(p4d);
> +
> +	return 0;
> +}
> +
> +static inline void p4d_clear(p4d_t *p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		set_p4d(p4d, __p4d(0));
> +}
> +
> +static inline pud_t *p4d_pgtable(p4d_t p4d)
> +{
> +	if (pgtable_l4_enabled)
> +		return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +
> +	return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
> +}
> +
> +static inline struct page *p4d_page(p4d_t p4d)
> +{
> +	return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +}
> +
> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> +
> +#define pud_offset pud_offset
> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +{
> +	if (pgtable_l4_enabled)
> +		return p4d_pgtable(*p4d) + pud_index(address);
> +
> +	return (pud_t *)p4d;
> +}
> +
>  #endif /* _ASM_RISCV_PGTABLE_64_H */
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index e1a52e22ad7e..e1c74ef4ead2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -51,7 +51,7 @@
>   * position vmemmap directly below the VMALLOC region.
>   */
>  #ifdef CONFIG_64BIT
> -#define VA_BITS		39
> +#define VA_BITS		(pgtable_l4_enabled ? 48 : 39)
>  #else
>  #define VA_BITS		32
>  #endif
> @@ -90,8 +90,7 @@
>  
>  #ifndef __ASSEMBLY__
>  
> -/* Page Upper Directory not used in RISC-V */
> -#include <asm-generic/pgtable-nopud.h>
> +#include <asm-generic/pgtable-nop4d.h>
>  #include <asm/page.h>
>  #include <asm/tlbflush.h>
>  #include <linux/mm_types.h>
> @@ -113,6 +112,17 @@
>  #define XIP_FIXUP(addr)		(addr)
>  #endif /* CONFIG_XIP_KERNEL */
>  
> +struct pt_alloc_ops {
> +	pte_t *(*get_pte_virt)(phys_addr_t pa);
> +	phys_addr_t (*alloc_pte)(uintptr_t va);
> +#ifndef __PAGETABLE_PMD_FOLDED
> +	pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> +	phys_addr_t (*alloc_pmd)(uintptr_t va);
> +	pud_t *(*get_pud_virt)(phys_addr_t pa);
> +	phys_addr_t (*alloc_pud)(uintptr_t va);
> +#endif
> +};
> +
>  #ifdef CONFIG_MMU
>  /* Number of entries in the page global directory */
>  #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
> @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
>   * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
>   */
>  #ifdef CONFIG_64BIT
> -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
> +#define TASK_SIZE      (PGDIR_SIZE * PTRS_PER_PGD / 2)
> +#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
>  #else
> -#define TASK_SIZE FIXADDR_START
> +#define TASK_SIZE	FIXADDR_START
> +#define TASK_SIZE_MIN	TASK_SIZE
>  #endif
>  
>  #else /* CONFIG_MMU */
> @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa;
>  #define dtb_early_va	_dtb_early_va
>  #define dtb_early_pa	_dtb_early_pa
>  #endif /* CONFIG_XIP_KERNEL */
> +extern u64 satp_mode;
> +extern bool pgtable_l4_enabled;
>  
>  void paging_init(void);
>  void misc_mem_init(void);
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index 52c5ff9804c5..c3c0ed559770 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -95,7 +95,8 @@ relocate:
>  
>  	/* Compute satp for kernel page tables, but don't load it yet */
>  	srl a2, a0, PAGE_SHIFT
> -	li a1, SATP_MODE
> +	la a1, satp_mode
> +	REG_L a1, 0(a1)
>  	or a2, a2, a1
>  
>  	/*
> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
> index ee3459cb6750..a7246872bd30 100644
> --- a/arch/riscv/mm/context.c
> +++ b/arch/riscv/mm/context.c
> @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
>  switch_mm_fast:
>  	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
>  		  ((cntx & asid_mask) << SATP_ASID_SHIFT) |
> -		  SATP_MODE);
> +		  satp_mode);
>  
>  	if (need_flush_tlb)
>  		local_flush_tlb_all();
> @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
>  static void set_mm_noasid(struct mm_struct *mm)
>  {
>  	/* Switch the page table and blindly nuke entire local TLB */
> -	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
> +	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
>  	local_flush_tlb_all();
>  }
>  
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 1552226fb6bd..6a19a1b1caf8 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map);
>  #define kernel_map	(*(struct kernel_mapping *)XIP_FIXUP(&kernel_map))
>  #endif
>  
> +#ifdef CONFIG_64BIT
> +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39;
> +#else
> +u64 satp_mode = SATP_MODE_32;
> +#endif
> +EXPORT_SYMBOL(satp_mode);
> +
> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ?
> +				true : false;

Hi Alex,

I'm not sure whether we can use a static key for pgtable_l4_enabled or
not. Obviously, for a specific HW platform, pgtable_l4_enabled won't change
after boot, and it seems to sit in hot code paths, so IMHO a static key may be
suitable for it.

Thanks
Guo Ren Dec. 29, 2021, 3:42 a.m. UTC | #4
On Tue, Dec 7, 2021 at 11:54 AM Alexandre Ghiti
<alexandre.ghiti@canonical.com> wrote:
>
> By adding a new 4th level of page table, give the possibility to 64bit
> kernel to address 2^48 bytes of virtual address: in practice, that offers
> 128TB of virtual address space to userspace and allows up to 64TB of
> physical memory.
>
> If the underlying hardware does not support sv48, we will automatically
> fallback to a standard 3-level page table by folding the new PUD level into
> PGDIR level. In order to detect HW capabilities at runtime, we
> use SATP feature that ignores writes with an unsupported mode.
>
> Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
> ---
>  arch/riscv/Kconfig                      |   4 +-
>  arch/riscv/include/asm/csr.h            |   3 +-
>  arch/riscv/include/asm/fixmap.h         |   1 +
>  arch/riscv/include/asm/kasan.h          |   6 +-
>  arch/riscv/include/asm/page.h           |  14 ++
>  arch/riscv/include/asm/pgalloc.h        |  40 +++++
>  arch/riscv/include/asm/pgtable-64.h     | 108 +++++++++++-
>  arch/riscv/include/asm/pgtable.h        |  24 ++-
>  arch/riscv/kernel/head.S                |   3 +-
>  arch/riscv/mm/context.c                 |   4 +-
>  arch/riscv/mm/init.c                    | 212 +++++++++++++++++++++---
>  arch/riscv/mm/kasan_init.c              | 137 ++++++++++++++-
>  drivers/firmware/efi/libstub/efi-stub.c |   2 +
>  13 files changed, 514 insertions(+), 44 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index ac6c0cd9bc29..d28fe0148e13 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -150,7 +150,7 @@ config PAGE_OFFSET
>         hex
>         default 0xC0000000 if 32BIT
>         default 0x80000000 if 64BIT && !MMU
> -       default 0xffffffd800000000 if 64BIT
> +       default 0xffffaf8000000000 if 64BIT
>
>  config KASAN_SHADOW_OFFSET
>         hex
> @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM
>
>  config PGTABLE_LEVELS
>         int
> -       default 3 if 64BIT
> +       default 4 if 64BIT
>         default 2
>
>  config LOCKDEP_SUPPORT
> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
> index 87ac65696871..3fdb971c7896 100644
> --- a/arch/riscv/include/asm/csr.h
> +++ b/arch/riscv/include/asm/csr.h
> @@ -40,14 +40,13 @@
>  #ifndef CONFIG_64BIT
>  #define SATP_PPN       _AC(0x003FFFFF, UL)
>  #define SATP_MODE_32   _AC(0x80000000, UL)
> -#define SATP_MODE      SATP_MODE_32
>  #define SATP_ASID_BITS 9
>  #define SATP_ASID_SHIFT        22
>  #define SATP_ASID_MASK _AC(0x1FF, UL)
>  #else
>  #define SATP_PPN       _AC(0x00000FFFFFFFFFFF, UL)
>  #define SATP_MODE_39   _AC(0x8000000000000000, UL)
> -#define SATP_MODE      SATP_MODE_39
> +#define SATP_MODE_48   _AC(0x9000000000000000, UL)
>  #define SATP_ASID_BITS 16
>  #define SATP_ASID_SHIFT        44
>  #define SATP_ASID_MASK _AC(0xFFFF, UL)
> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> index 54cbf07fb4e9..58a718573ad6 100644
> --- a/arch/riscv/include/asm/fixmap.h
> +++ b/arch/riscv/include/asm/fixmap.h
> @@ -24,6 +24,7 @@ enum fixed_addresses {
>         FIX_HOLE,
>         FIX_PTE,
>         FIX_PMD,
> +       FIX_PUD,
>         FIX_TEXT_POKE1,
>         FIX_TEXT_POKE0,
>         FIX_EARLYCON_MEM_BASE,
> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
> index 743e6ff57996..0b85e363e778 100644
> --- a/arch/riscv/include/asm/kasan.h
> +++ b/arch/riscv/include/asm/kasan.h
> @@ -28,7 +28,11 @@
>  #define KASAN_SHADOW_SCALE_SHIFT       3
>
>  #define KASAN_SHADOW_SIZE      (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
> -#define KASAN_SHADOW_START     (KASAN_SHADOW_END - KASAN_SHADOW_SIZE)
> +/*
> + * Depending on the size of the virtual address space, the region may not be
> + * aligned on PGDIR_SIZE, so force its alignment to ease its population.
> + */
> +#define KASAN_SHADOW_START     ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
>  #define KASAN_SHADOW_END       MODULES_LOWEST_VADDR
>  #define KASAN_SHADOW_OFFSET    _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
>
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index e03559f9b35e..d089fe46f7d8 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -31,7 +31,20 @@
>   * When not using MMU this corresponds to the first free page in
>   * physical memory (aligned on a page boundary).
>   */
> +#ifdef CONFIG_64BIT
> +#ifdef CONFIG_MMU
> +#define PAGE_OFFSET            kernel_map.page_offset
> +#else
> +#define PAGE_OFFSET            _AC(CONFIG_PAGE_OFFSET, UL)
> +#endif
> +/*
> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
> + * define the PAGE_OFFSET value for SV39.
> + */
> +#define PAGE_OFFSET_L3         _AC(0xffffffd800000000, UL)
> +#else
>  #define PAGE_OFFSET            _AC(CONFIG_PAGE_OFFSET, UL)
> +#endif /* CONFIG_64BIT */
>
>  /*
>   * Half of the kernel address space (half of the entries of the page global
> @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base;
>  #endif /* CONFIG_MMU */
>
>  struct kernel_mapping {
> +       unsigned long page_offset;
>         unsigned long virt_addr;
>         uintptr_t phys_addr;
>         uintptr_t size;
> diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
> index 0af6933a7100..11823004b87a 100644
> --- a/arch/riscv/include/asm/pgalloc.h
> +++ b/arch/riscv/include/asm/pgalloc.h
> @@ -11,6 +11,8 @@
>  #include <asm/tlb.h>
>
>  #ifdef CONFIG_MMU
> +#define __HAVE_ARCH_PUD_ALLOC_ONE
> +#define __HAVE_ARCH_PUD_FREE
>  #include <asm-generic/pgalloc.h>
>
>  static inline void pmd_populate_kernel(struct mm_struct *mm,
> @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
>
>         set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
>  }
> +
> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
> +{
> +       if (pgtable_l4_enabled) {
> +               unsigned long pfn = virt_to_pfn(pud);
> +
> +               set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> +       }
> +}
> +
> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> +                                    pud_t *pud)
> +{
> +       if (pgtable_l4_enabled) {
> +               unsigned long pfn = virt_to_pfn(pud);
> +
> +               set_p4d_safe(p4d,
> +                            __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> +       }
> +}
> +
> +#define pud_alloc_one pud_alloc_one
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> +{
> +       if (pgtable_l4_enabled)
> +               return __pud_alloc_one(mm, addr);
> +
> +       return NULL;
> +}
> +
> +#define pud_free pud_free
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> +       if (pgtable_l4_enabled)
> +               __pud_free(mm, pud);
> +}
> +
> +#define __pud_free_tlb(tlb, pud, addr)  pud_free((tlb)->mm, pud)
>  #endif /* __PAGETABLE_PMD_FOLDED */
>
>  static inline pgd_t *pgd_alloc(struct mm_struct *mm)
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index 228261aa9628..bbbdd66e5e2f 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -8,16 +8,36 @@
>
>  #include <linux/const.h>
>
> -#define PGDIR_SHIFT     30
> +extern bool pgtable_l4_enabled;
> +
> +#define PGDIR_SHIFT_L3  30
> +#define PGDIR_SHIFT_L4  39
> +#define PGDIR_SIZE_L3   (_AC(1, UL) << PGDIR_SHIFT_L3)
> +
> +#define PGDIR_SHIFT     (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3)
>  /* Size of region mapped by a page global directory */
>  #define PGDIR_SIZE      (_AC(1, UL) << PGDIR_SHIFT)
>  #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
>
> +/* pud is folded into pgd in case of 3-level page table */
> +#define PUD_SHIFT      30
> +#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
> +#define PUD_MASK       (~(PUD_SIZE - 1))
> +
>  #define PMD_SHIFT       21
>  /* Size of region mapped by a page middle directory */
>  #define PMD_SIZE        (_AC(1, UL) << PMD_SHIFT)
>  #define PMD_MASK        (~(PMD_SIZE - 1))
>
> +/* Page Upper Directory entry */
> +typedef struct {
> +       unsigned long pud;
> +} pud_t;
> +
> +#define pud_val(x)      ((x).pud)
> +#define __pud(x)        ((pud_t) { (x) })
> +#define PTRS_PER_PUD    (PAGE_SIZE / sizeof(pud_t))
> +
>  /* Page Middle Directory entry */
>  typedef struct {
>         unsigned long pmd;
> @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp)
>         set_pud(pudp, __pud(0));
>  }
>
> +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
> +{
> +       return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> +}
> +
> +static inline unsigned long _pud_pfn(pud_t pud)
> +{
> +       return pud_val(pud) >> _PAGE_PFN_SHIFT;
> +}
> +
>  static inline pmd_t *pud_pgtable(pud_t pud)
>  {
>         return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
> @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud)
>         return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
>  }
>
> +#define mm_pud_folded  mm_pud_folded
> +static inline bool mm_pud_folded(struct mm_struct *mm)
> +{
> +       if (pgtable_l4_enabled)
> +               return false;
> +
> +       return true;
> +}
> +
> +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
> +
>  static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
>  {
>         return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
>  #define pmd_ERROR(e) \
>         pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
>
> +#define pud_ERROR(e)   \
> +       pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
> +
> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> +{
> +       if (pgtable_l4_enabled)
> +               *p4dp = p4d;
> +       else
> +               set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
> +}
> +
> +static inline int p4d_none(p4d_t p4d)
> +{
> +       if (pgtable_l4_enabled)
> +               return (p4d_val(p4d) == 0);
> +
> +       return 0;
> +}
> +
> +static inline int p4d_present(p4d_t p4d)
> +{
> +       if (pgtable_l4_enabled)
> +               return (p4d_val(p4d) & _PAGE_PRESENT);
> +
> +       return 1;
> +}
> +
> +static inline int p4d_bad(p4d_t p4d)
> +{
> +       if (pgtable_l4_enabled)
> +               return !p4d_present(p4d);
> +
> +       return 0;
> +}
> +
> +static inline void p4d_clear(p4d_t *p4d)
> +{
> +       if (pgtable_l4_enabled)
> +               set_p4d(p4d, __p4d(0));
> +}
> +
> +static inline pud_t *p4d_pgtable(p4d_t p4d)
> +{
> +       if (pgtable_l4_enabled)
> +               return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +
> +       return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
> +}
> +
> +static inline struct page *p4d_page(p4d_t p4d)
> +{
> +       return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +}
> +
> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> +
> +#define pud_offset pud_offset
> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +{
> +       if (pgtable_l4_enabled)
> +               return p4d_pgtable(*p4d) + pud_index(address);
> +
> +       return (pud_t *)p4d;
> +}
> +
>  #endif /* _ASM_RISCV_PGTABLE_64_H */
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index e1a52e22ad7e..e1c74ef4ead2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -51,7 +51,7 @@
>   * position vmemmap directly below the VMALLOC region.
>   */
>  #ifdef CONFIG_64BIT
> -#define VA_BITS                39
> +#define VA_BITS                (pgtable_l4_enabled ? 48 : 39)
>  #else
>  #define VA_BITS                32
>  #endif
> @@ -90,8 +90,7 @@
>
>  #ifndef __ASSEMBLY__
>
> -/* Page Upper Directory not used in RISC-V */
> -#include <asm-generic/pgtable-nopud.h>
> +#include <asm-generic/pgtable-nop4d.h>
>  #include <asm/page.h>
>  #include <asm/tlbflush.h>
>  #include <linux/mm_types.h>
> @@ -113,6 +112,17 @@
>  #define XIP_FIXUP(addr)                (addr)
>  #endif /* CONFIG_XIP_KERNEL */
>
> +struct pt_alloc_ops {
> +       pte_t *(*get_pte_virt)(phys_addr_t pa);
> +       phys_addr_t (*alloc_pte)(uintptr_t va);
> +#ifndef __PAGETABLE_PMD_FOLDED
> +       pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> +       phys_addr_t (*alloc_pmd)(uintptr_t va);
> +       pud_t *(*get_pud_virt)(phys_addr_t pa);
> +       phys_addr_t (*alloc_pud)(uintptr_t va);
> +#endif
> +};
> +
>  #ifdef CONFIG_MMU
>  /* Number of entries in the page global directory */
>  #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
> @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
>   * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
>   */
>  #ifdef CONFIG_64BIT
> -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
> +#define TASK_SIZE      (PGDIR_SIZE * PTRS_PER_PGD / 2)
> +#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
>  #else
> -#define TASK_SIZE FIXADDR_START
> +#define TASK_SIZE      FIXADDR_START
> +#define TASK_SIZE_MIN  TASK_SIZE
This is used by efi-stub.c; the rv64 compat patch also needs it, and
there we reuse the DEFAULT_MAP_WINDOW_64 macro.

TASK_SIZE_MIN is also okay for me, but I think it should be a separate
patch together with the efi-stub modification.
https://lore.kernel.org/linux-riscv/20211228143958.3409187-9-guoren@kernel.org/

I've merged your patchset with the compat tree and we are testing them
together thoroughly and carefully.
https://github.com/c-sky/csky-linux/tree/riscv_compat_v2_sv48_v3

Booting with both rv32_rootfs and 64_rootfs now passes, but I will
give you a Tested-by later, once testing is complete. Your patch set is
very helpful, thanks.

PS: Could you provide a way to let users choose between sv48 and sv39 in the DTS?


>  #endif
>
>  #else /* CONFIG_MMU */
> @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa;
>  #define dtb_early_va   _dtb_early_va
>  #define dtb_early_pa   _dtb_early_pa
>  #endif /* CONFIG_XIP_KERNEL */
> +extern u64 satp_mode;
> +extern bool pgtable_l4_enabled;
>
>  void paging_init(void);
>  void misc_mem_init(void);
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index 52c5ff9804c5..c3c0ed559770 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -95,7 +95,8 @@ relocate:
>
>         /* Compute satp for kernel page tables, but don't load it yet */
>         srl a2, a0, PAGE_SHIFT
> -       li a1, SATP_MODE
> +       la a1, satp_mode
> +       REG_L a1, 0(a1)
>         or a2, a2, a1
>
>         /*
> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
> index ee3459cb6750..a7246872bd30 100644
> --- a/arch/riscv/mm/context.c
> +++ b/arch/riscv/mm/context.c
> @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
>  switch_mm_fast:
>         csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
>                   ((cntx & asid_mask) << SATP_ASID_SHIFT) |
> -                 SATP_MODE);
> +                 satp_mode);
>
>         if (need_flush_tlb)
>                 local_flush_tlb_all();
> @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
>  static void set_mm_noasid(struct mm_struct *mm)
>  {
>         /* Switch the page table and blindly nuke entire local TLB */
> -       csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
> +       csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
>         local_flush_tlb_all();
>  }
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 1552226fb6bd..6a19a1b1caf8 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map);
>  #define kernel_map     (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map))
>  #endif
>
> +#ifdef CONFIG_64BIT
> +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39;
> +#else
> +u64 satp_mode = SATP_MODE_32;
> +#endif
> +EXPORT_SYMBOL(satp_mode);
> +
> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ?
> +                               true : false;
> +EXPORT_SYMBOL(pgtable_l4_enabled);
> +
>  phys_addr_t phys_ram_base __ro_after_init;
>  EXPORT_SYMBOL(phys_ram_base);
>
> @@ -53,15 +64,6 @@ extern char _start[];
>  void *_dtb_early_va __initdata;
>  uintptr_t _dtb_early_pa __initdata;
>
> -struct pt_alloc_ops {
> -       pte_t *(*get_pte_virt)(phys_addr_t pa);
> -       phys_addr_t (*alloc_pte)(uintptr_t va);
> -#ifndef __PAGETABLE_PMD_FOLDED
> -       pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> -       phys_addr_t (*alloc_pmd)(uintptr_t va);
> -#endif
> -};
> -
>  static phys_addr_t dma32_phys_limit __initdata;
>
>  static void __init zone_sizes_init(void)
> @@ -222,7 +224,7 @@ static void __init setup_bootmem(void)
>  }
>
>  #ifdef CONFIG_MMU
> -static struct pt_alloc_ops _pt_ops __initdata;
> +struct pt_alloc_ops _pt_ops __initdata;
>
>  #ifdef CONFIG_XIP_KERNEL
>  #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
> @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
>  static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
>
>  pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
> +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
>  static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
>
>  #ifdef CONFIG_XIP_KERNEL
> @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
>  #define early_pmd      ((pmd_t *)XIP_FIXUP(early_pmd))
>  #endif /* CONFIG_XIP_KERNEL */
>
> +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
> +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
> +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
> +
> +#ifdef CONFIG_XIP_KERNEL
> +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud))
> +#define fixmap_pud     ((pud_t *)XIP_FIXUP(fixmap_pud))
> +#define early_pud      ((pud_t *)XIP_FIXUP(early_pud))
> +#endif /* CONFIG_XIP_KERNEL */
> +
>  static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
>  {
>         /* Before MMU is enabled */
> @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
>
>  static phys_addr_t __init alloc_pmd_early(uintptr_t va)
>  {
> -       BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
> +       BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT);
>
>         return (uintptr_t)early_pmd;
>  }
> @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
>         create_pte_mapping(ptep, va, pa, sz, prot);
>  }
>
> -#define pgd_next_t             pmd_t
> -#define alloc_pgd_next(__va)   pt_ops.alloc_pmd(__va)
> -#define get_pgd_next_virt(__pa)        pt_ops.get_pmd_virt(__pa)
> +static pud_t *__init get_pud_virt_early(phys_addr_t pa)
> +{
> +       return (pud_t *)((uintptr_t)pa);
> +}
> +
> +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
> +{
> +       clear_fixmap(FIX_PUD);
> +       return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
> +}
> +
> +static pud_t *__init get_pud_virt_late(phys_addr_t pa)
> +{
> +       return (pud_t *)__va(pa);
> +}
> +
> +static phys_addr_t __init alloc_pud_early(uintptr_t va)
> +{
> +       /* Only one PUD is available for early mapping */
> +       BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
> +
> +       return (uintptr_t)early_pud;
> +}
> +
> +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
> +{
> +       return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> +}
> +
> +static phys_addr_t alloc_pud_late(uintptr_t va)
> +{
> +       unsigned long vaddr;
> +
> +       vaddr = __get_free_page(GFP_KERNEL);
> +       BUG_ON(!vaddr);
> +       return __pa(vaddr);
> +}
> +
> +static void __init create_pud_mapping(pud_t *pudp,
> +                                     uintptr_t va, phys_addr_t pa,
> +                                     phys_addr_t sz, pgprot_t prot)
> +{
> +       pmd_t *nextp;
> +       phys_addr_t next_phys;
> +       uintptr_t pud_index = pud_index(va);
> +
> +       if (sz == PUD_SIZE) {
> +               if (pud_val(pudp[pud_index]) == 0)
> +                       pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
> +               return;
> +       }
> +
> +       if (pud_val(pudp[pud_index]) == 0) {
> +               next_phys = pt_ops.alloc_pmd(va);
> +               pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
> +               nextp = pt_ops.get_pmd_virt(next_phys);
> +               memset(nextp, 0, PAGE_SIZE);
> +       } else {
> +               next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
> +               nextp = pt_ops.get_pmd_virt(next_phys);
> +       }
> +
> +       create_pmd_mapping(nextp, va, pa, sz, prot);
> +}
> +
> +#define pgd_next_t             pud_t
> +#define alloc_pgd_next(__va)   (pgtable_l4_enabled ?                   \
> +               pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va))
> +#define get_pgd_next_virt(__pa)        (pgtable_l4_enabled ?                   \
> +               pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa))
>  #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)     \
> -       create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next                fixmap_pmd
> +                               (pgtable_l4_enabled ?                   \
> +               create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \
> +               create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot))
> +#define fixmap_pgd_next                (pgtable_l4_enabled ?                   \
> +               (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
> +#define trampoline_pgd_next    (pgtable_l4_enabled ?                   \
> +               (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
> +#define early_dtb_pgd_next     (pgtable_l4_enabled ?                   \
> +               (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)
>  #else
>  #define pgd_next_t             pte_t
>  #define alloc_pgd_next(__va)   pt_ops.alloc_pte(__va)
>  #define get_pgd_next_virt(__pa)        pt_ops.get_pte_virt(__pa)
>  #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)     \
>         create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next                fixmap_pte
> +#define fixmap_pgd_next                ((uintptr_t)fixmap_pte)
> +#define early_dtb_pgd_next     ((uintptr_t)early_dtb_pmd)
> +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot)
>  #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot)
> -#endif
> +#endif /* __PAGETABLE_PMD_FOLDED */
>
>  void __init create_pgd_mapping(pgd_t *pgdp,
>                                       uintptr_t va, phys_addr_t pa,
> @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va)
>  }
>  #endif /* CONFIG_STRICT_KERNEL_RWX */
>
> +#ifdef CONFIG_64BIT
> +static void __init disable_pgtable_l4(void)
> +{
> +       pgtable_l4_enabled = false;
> +       kernel_map.page_offset = PAGE_OFFSET_L3;
> +       satp_mode = SATP_MODE_39;
> +}
> +
> +/*
> + * There is a simple way to determine if 4-level is supported by the
> + * underlying hardware: establish 1:1 mapping in 4-level page table mode
> + * then read SATP to see if the configuration was taken into account
> + * meaning sv48 is supported.
> + */
> +static __init void set_satp_mode(void)
> +{
> +       u64 identity_satp, hw_satp;
> +       uintptr_t set_satp_mode_pmd;
> +
> +       set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
> +       create_pgd_mapping(early_pg_dir,
> +                          set_satp_mode_pmd, (uintptr_t)early_pud,
> +                          PGDIR_SIZE, PAGE_TABLE);
> +       create_pud_mapping(early_pud,
> +                          set_satp_mode_pmd, (uintptr_t)early_pmd,
> +                          PUD_SIZE, PAGE_TABLE);
> +       /* Handle the case where set_satp_mode straddles 2 PMDs */
> +       create_pmd_mapping(early_pmd,
> +                          set_satp_mode_pmd, set_satp_mode_pmd,
> +                          PMD_SIZE, PAGE_KERNEL_EXEC);
> +       create_pmd_mapping(early_pmd,
> +                          set_satp_mode_pmd + PMD_SIZE,
> +                          set_satp_mode_pmd + PMD_SIZE,
> +                          PMD_SIZE, PAGE_KERNEL_EXEC);
> +
> +       identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
> +
> +       local_flush_tlb_all();
> +       csr_write(CSR_SATP, identity_satp);
> +       hw_satp = csr_swap(CSR_SATP, 0ULL);
> +       local_flush_tlb_all();
> +
> +       if (hw_satp != identity_satp)
> +               disable_pgtable_l4();
> +
> +       memset(early_pg_dir, 0, PAGE_SIZE);
> +       memset(early_pud, 0, PAGE_SIZE);
> +       memset(early_pmd, 0, PAGE_SIZE);
> +}
> +#endif
> +
>  /*
>   * setup_vm() is called from head.S with MMU-off.
>   *
> @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa)
>         uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1);
>
>         create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA,
> -                          IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa,
> +                          IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa,
>                            PGDIR_SIZE,
>                            IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL);
>
> +       if (pgtable_l4_enabled) {
> +               create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA,
> +                                  (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE);
> +       }
> +
>         if (IS_ENABLED(CONFIG_64BIT)) {
>                 create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
>                                    pa, PMD_SIZE, PAGE_KERNEL);
> @@ -593,6 +738,8 @@ void pt_ops_set_early(void)
>  #ifndef __PAGETABLE_PMD_FOLDED
>         pt_ops.alloc_pmd = alloc_pmd_early;
>         pt_ops.get_pmd_virt = get_pmd_virt_early;
> +       pt_ops.alloc_pud = alloc_pud_early;
> +       pt_ops.get_pud_virt = get_pud_virt_early;
>  #endif
>  }
>
> @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void)
>  #ifndef __PAGETABLE_PMD_FOLDED
>         pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap);
>         pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap);
> +       pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap);
> +       pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap);
>  #endif
>  }
>
> @@ -625,6 +774,8 @@ void pt_ops_set_late(void)
>  #ifndef __PAGETABLE_PMD_FOLDED
>         pt_ops.alloc_pmd = alloc_pmd_late;
>         pt_ops.get_pmd_virt = get_pmd_virt_late;
> +       pt_ops.alloc_pud = alloc_pud_late;
> +       pt_ops.get_pud_virt = get_pud_virt_late;
>  #endif
>  }
>
> @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>         pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd;
>
>         kernel_map.virt_addr = KERNEL_LINK_ADDR;
> +       kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
>
>  #ifdef CONFIG_XIP_KERNEL
>         kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
> @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>         kernel_map.phys_addr = (uintptr_t)(&_start);
>         kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
>  #endif
> +
> +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
> +       set_satp_mode();
> +#endif
> +
>         kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
>         kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
>
> @@ -676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>
>         /* Setup early PGD for fixmap */
>         create_pgd_mapping(early_pg_dir, FIXADDR_START,
> -                          (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> +                          fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>
>  #ifndef __PAGETABLE_PMD_FOLDED
> -       /* Setup fixmap PMD */
> +       /* Setup fixmap PUD and PMD */
> +       if (pgtable_l4_enabled)
> +               create_pud_mapping(fixmap_pud, FIXADDR_START,
> +                                  (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
>         create_pmd_mapping(fixmap_pmd, FIXADDR_START,
>                            (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
>         /* Setup trampoline PGD and PMD */
>         create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
> -                          (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> +                          trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> +       if (pgtable_l4_enabled)
> +               create_pud_mapping(trampoline_pud, kernel_map.virt_addr,
> +                                  (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
>  #ifdef CONFIG_XIP_KERNEL
>         create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
>                            kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
> @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>          * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap
>          * range can not span multiple pmds.
>          */
> -       BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
> +       BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
>                      != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
>
>  #ifndef __PAGETABLE_PMD_FOLDED
> @@ -783,9 +946,10 @@ static void __init setup_vm_final(void)
>         /* Clear fixmap PTE and PMD mappings */
>         clear_fixmap(FIX_PTE);
>         clear_fixmap(FIX_PMD);
> +       clear_fixmap(FIX_PUD);
>
>         /* Move to swapper page table */
> -       csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> +       csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
>         local_flush_tlb_all();
>
>         pt_ops_set_late();
> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> index 1434a0225140..993f50571a3b 100644
> --- a/arch/riscv/mm/kasan_init.c
> +++ b/arch/riscv/mm/kasan_init.c
> @@ -11,7 +11,29 @@
>  #include <asm/fixmap.h>
>  #include <asm/pgalloc.h>
>
> +/*
> + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57
> + * which is right before the kernel.
> + *
> + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate
> + * the page global directory with kasan_early_shadow_pmd.
> + *
> + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping
> + * must be divided as follows:
> + * - the first PGD entry, although incomplete, is populated with
> + *   kasan_early_shadow_pud/p4d
> + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d
> + * - the last PGD entry is shared with the kernel mapping so populated at the
> + *   lower levels pud/p4d
> + *
> + * In addition, when shallow populating a kasan region (for example vmalloc),
> + * this region may also not be aligned on PGDIR size, so we must go down to the
> + * pud level too.
> + */
> +
>  extern pgd_t early_pg_dir[PTRS_PER_PGD];
> +extern struct pt_alloc_ops _pt_ops __initdata;
> +#define pt_ops _pt_ops
>
>  static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
>  {
> @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned
>         set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
>  }
>
> -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
> +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end)
>  {
>         phys_addr_t phys_addr;
>         pmd_t *pmdp, *base_pmd;
>         unsigned long next;
>
> -       base_pmd = (pmd_t *)pgd_page_vaddr(*pgd);
> -       if (base_pmd == lm_alias(kasan_early_shadow_pmd))
> +       if (pud_none(*pud)) {
>                 base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
> +       } else {
> +               base_pmd = (pmd_t *)pud_pgtable(*pud);
> +               if (base_pmd == lm_alias(kasan_early_shadow_pmd))
> +                       base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
> +       }
>
>         pmdp = base_pmd + pmd_index(vaddr);
>
> @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned
>          * it entirely, memblock could allocate a page at a physical address
>          * where KASAN is not populated yet and then we'd get a page fault.
>          */
> -       set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> +       set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> +}
> +
> +static void __init kasan_populate_pud(pgd_t *pgd,
> +                                     unsigned long vaddr, unsigned long end,
> +                                     bool early)
> +{
> +       phys_addr_t phys_addr;
> +       pud_t *pudp, *base_pud;
> +       unsigned long next;
> +
> +       if (early) {
> +               /*
> +                * We can't use pgd_page_vaddr here as it would return a linear
> +                * mapping address but it is not mapped yet, but when populating
> +                * early_pg_dir, we need the physical address and when populating
> +                * swapper_pg_dir, we need the kernel virtual address so use
> +                * pt_ops facility.
> +                */
> +               base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd)));
> +       } else {
> +               base_pud = (pud_t *)pgd_page_vaddr(*pgd);
> +               if (base_pud == lm_alias(kasan_early_shadow_pud))
> +                       base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE);
> +       }
> +
> +       pudp = base_pud + pud_index(vaddr);
> +
> +       do {
> +               next = pud_addr_end(vaddr, end);
> +
> +               if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) {
> +                       if (early) {
> +                               phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd));
> +                               set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE));
> +                               continue;
> +                       } else {
> +                               phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE);
> +                               if (phys_addr) {
> +                                       set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL));
> +                                       continue;
> +                               }
> +                       }
> +               }
> +
> +               kasan_populate_pmd(pudp, vaddr, next);
> +       } while (pudp++, vaddr = next, vaddr != end);
> +
> +       /*
> +        * Wait for the whole PGD to be populated before setting the PGD in
> +        * the page table, otherwise, if we did set the PGD before populating
> +        * it entirely, memblock could allocate a page at a physical address
> +        * where KASAN is not populated yet and then we'd get a page fault.
> +        */
> +       if (!early)
> +               set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE));
>  }
>
> +#define kasan_early_shadow_pgd_next                    (pgtable_l4_enabled ?   \
> +                               (uintptr_t)kasan_early_shadow_pud :             \
> +                               (uintptr_t)kasan_early_shadow_pmd)
> +#define kasan_populate_pgd_next(pgdp, vaddr, next, early)                      \
> +               (pgtable_l4_enabled ?                                           \
> +                       kasan_populate_pud(pgdp, vaddr, next, early) :          \
> +                       kasan_populate_pmd((pud_t *)pgdp, vaddr, next))
> +
>  static void __init kasan_populate_pgd(pgd_t *pgdp,
>                                       unsigned long vaddr, unsigned long end,
>                                       bool early)
> @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp,
>                         }
>                 }
>
> -               kasan_populate_pmd(pgdp, vaddr, next);
> +               kasan_populate_pgd_next(pgdp, vaddr, next, early);
>         } while (pgdp++, vaddr = next, vaddr != end);
>  }
>
> @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end)
>         memset(start, KASAN_SHADOW_INIT, end - start);
>  }
>
> +static void __init kasan_shallow_populate_pud(pgd_t *pgdp,
> +                                             unsigned long vaddr, unsigned long end,
> +                                             bool kasan_populate)
> +{
> +       unsigned long next;
> +       pud_t *pudp, *base_pud;
> +       pmd_t *base_pmd;
> +       bool is_kasan_pmd;
> +
> +       base_pud = (pud_t *)pgd_page_vaddr(*pgdp);
> +       pudp = base_pud + pud_index(vaddr);
> +
> +       if (kasan_populate)
> +               memcpy(base_pud, (void *)kasan_early_shadow_pgd_next,
> +                      sizeof(pud_t) * PTRS_PER_PUD);
> +
> +       do {
> +               next = pud_addr_end(vaddr, end);
> +               is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd));
> +
> +               if (is_kasan_pmd) {
> +                       base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> +                       set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> +               }
> +       } while (pudp++, vaddr = next, vaddr != end);
> +}
> +
>  static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end)
>  {
>         unsigned long next;
>         void *p;
>         pgd_t *pgd_k = pgd_offset_k(vaddr);
> +       bool is_kasan_pgd_next;
>
>         do {
>                 next = pgd_addr_end(vaddr, end);
> -               if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) {
> +               is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) ==
> +                                    (unsigned long)lm_alias(kasan_early_shadow_pgd_next));
> +
> +               if (is_kasan_pgd_next) {
>                         p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
>                         set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
>                 }
> +
> +               if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE)
> +                       continue;
> +
> +               kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next);
>         } while (pgd_k++, vaddr = next, vaddr != end);
>  }
>
> diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
> index 26e69788f27a..b3db5d91ed38 100644
> --- a/drivers/firmware/efi/libstub/efi-stub.c
> +++ b/drivers/firmware/efi/libstub/efi-stub.c
> @@ -40,6 +40,8 @@
>
>  #ifdef CONFIG_ARM64
>  # define EFI_RT_VIRTUAL_LIMIT  DEFAULT_MAP_WINDOW_64
> +#elif defined(CONFIG_RISCV)
> +# define EFI_RT_VIRTUAL_LIMIT  TASK_SIZE_MIN
>  #else
>  # define EFI_RT_VIRTUAL_LIMIT  TASK_SIZE
>  #endif
> --
> 2.32.0
>


--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Alexandre Ghiti Jan. 4, 2022, 12:42 p.m. UTC | #5
Hi Guo,

On Wed, Dec 29, 2021 at 4:42 AM Guo Ren <guoren@kernel.org> wrote:
>
> On Tue, Dec 7, 2021 at 11:54 AM Alexandre Ghiti
> <alexandre.ghiti@canonical.com> wrote:
> >
> > By adding a new 4th level of page table, give the possibility to 64bit
> > kernel to address 2^48 bytes of virtual address: in practice, that offers
> > 128TB of virtual address space to userspace and allows up to 64TB of
> > physical memory.
> >
> > If the underlying hardware does not support sv48, we will automatically
> > fallback to a standard 3-level page table by folding the new PUD level into
> > PGDIR level. In order to detect HW capabilities at runtime, we
> > use SATP feature that ignores writes with an unsupported mode.
> >
> > Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
> > ---
> >  arch/riscv/Kconfig                      |   4 +-
> >  arch/riscv/include/asm/csr.h            |   3 +-
> >  arch/riscv/include/asm/fixmap.h         |   1 +
> >  arch/riscv/include/asm/kasan.h          |   6 +-
> >  arch/riscv/include/asm/page.h           |  14 ++
> >  arch/riscv/include/asm/pgalloc.h        |  40 +++++
> >  arch/riscv/include/asm/pgtable-64.h     | 108 +++++++++++-
> >  arch/riscv/include/asm/pgtable.h        |  24 ++-
> >  arch/riscv/kernel/head.S                |   3 +-
> >  arch/riscv/mm/context.c                 |   4 +-
> >  arch/riscv/mm/init.c                    | 212 +++++++++++++++++++++---
> >  arch/riscv/mm/kasan_init.c              | 137 ++++++++++++++-
> >  drivers/firmware/efi/libstub/efi-stub.c |   2 +
> >  13 files changed, 514 insertions(+), 44 deletions(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index ac6c0cd9bc29..d28fe0148e13 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -150,7 +150,7 @@ config PAGE_OFFSET
> >         hex
> >         default 0xC0000000 if 32BIT
> >         default 0x80000000 if 64BIT && !MMU
> > -       default 0xffffffd800000000 if 64BIT
> > +       default 0xffffaf8000000000 if 64BIT
> >
> >  config KASAN_SHADOW_OFFSET
> >         hex
> > @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM
> >
> >  config PGTABLE_LEVELS
> >         int
> > -       default 3 if 64BIT
> > +       default 4 if 64BIT
> >         default 2
> >
> >  config LOCKDEP_SUPPORT
> > diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
> > index 87ac65696871..3fdb971c7896 100644
> > --- a/arch/riscv/include/asm/csr.h
> > +++ b/arch/riscv/include/asm/csr.h
> > @@ -40,14 +40,13 @@
> >  #ifndef CONFIG_64BIT
> >  #define SATP_PPN       _AC(0x003FFFFF, UL)
> >  #define SATP_MODE_32   _AC(0x80000000, UL)
> > -#define SATP_MODE      SATP_MODE_32
> >  #define SATP_ASID_BITS 9
> >  #define SATP_ASID_SHIFT        22
> >  #define SATP_ASID_MASK _AC(0x1FF, UL)
> >  #else
> >  #define SATP_PPN       _AC(0x00000FFFFFFFFFFF, UL)
> >  #define SATP_MODE_39   _AC(0x8000000000000000, UL)
> > -#define SATP_MODE      SATP_MODE_39
> > +#define SATP_MODE_48   _AC(0x9000000000000000, UL)
> >  #define SATP_ASID_BITS 16
> >  #define SATP_ASID_SHIFT        44
> >  #define SATP_ASID_MASK _AC(0xFFFF, UL)
> > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> > index 54cbf07fb4e9..58a718573ad6 100644
> > --- a/arch/riscv/include/asm/fixmap.h
> > +++ b/arch/riscv/include/asm/fixmap.h
> > @@ -24,6 +24,7 @@ enum fixed_addresses {
> >         FIX_HOLE,
> >         FIX_PTE,
> >         FIX_PMD,
> > +       FIX_PUD,
> >         FIX_TEXT_POKE1,
> >         FIX_TEXT_POKE0,
> >         FIX_EARLYCON_MEM_BASE,
> > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
> > index 743e6ff57996..0b85e363e778 100644
> > --- a/arch/riscv/include/asm/kasan.h
> > +++ b/arch/riscv/include/asm/kasan.h
> > @@ -28,7 +28,11 @@
> >  #define KASAN_SHADOW_SCALE_SHIFT       3
> >
> >  #define KASAN_SHADOW_SIZE      (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
> > -#define KASAN_SHADOW_START     (KASAN_SHADOW_END - KASAN_SHADOW_SIZE)
> > +/*
> > + * Depending on the size of the virtual address space, the region may not be
> > + * aligned on PGDIR_SIZE, so force its alignment to ease its population.
> > + */
> > +#define KASAN_SHADOW_START     ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
> >  #define KASAN_SHADOW_END       MODULES_LOWEST_VADDR
> >  #define KASAN_SHADOW_OFFSET    _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
> >
> > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> > index e03559f9b35e..d089fe46f7d8 100644
> > --- a/arch/riscv/include/asm/page.h
> > +++ b/arch/riscv/include/asm/page.h
> > @@ -31,7 +31,20 @@
> >   * When not using MMU this corresponds to the first free page in
> >   * physical memory (aligned on a page boundary).
> >   */
> > +#ifdef CONFIG_64BIT
> > +#ifdef CONFIG_MMU
> > +#define PAGE_OFFSET            kernel_map.page_offset
> > +#else
> > +#define PAGE_OFFSET            _AC(CONFIG_PAGE_OFFSET, UL)
> > +#endif
> > +/*
> > + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
> > + * define the PAGE_OFFSET value for SV39.
> > + */
> > +#define PAGE_OFFSET_L3         _AC(0xffffffd800000000, UL)
> > +#else
> >  #define PAGE_OFFSET            _AC(CONFIG_PAGE_OFFSET, UL)
> > +#endif /* CONFIG_64BIT */
> >
> >  /*
> >   * Half of the kernel address space (half of the entries of the page global
> > @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base;
> >  #endif /* CONFIG_MMU */
> >
> >  struct kernel_mapping {
> > +       unsigned long page_offset;
> >         unsigned long virt_addr;
> >         uintptr_t phys_addr;
> >         uintptr_t size;
> > diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
> > index 0af6933a7100..11823004b87a 100644
> > --- a/arch/riscv/include/asm/pgalloc.h
> > +++ b/arch/riscv/include/asm/pgalloc.h
> > @@ -11,6 +11,8 @@
> >  #include <asm/tlb.h>
> >
> >  #ifdef CONFIG_MMU
> > +#define __HAVE_ARCH_PUD_ALLOC_ONE
> > +#define __HAVE_ARCH_PUD_FREE
> >  #include <asm-generic/pgalloc.h>
> >
> >  static inline void pmd_populate_kernel(struct mm_struct *mm,
> > @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> >
> >         set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> >  }
> > +
> > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
> > +{
> > +       if (pgtable_l4_enabled) {
> > +               unsigned long pfn = virt_to_pfn(pud);
> > +
> > +               set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> > +       }
> > +}
> > +
> > +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> > +                                    pud_t *pud)
> > +{
> > +       if (pgtable_l4_enabled) {
> > +               unsigned long pfn = virt_to_pfn(pud);
> > +
> > +               set_p4d_safe(p4d,
> > +                            __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> > +       }
> > +}
> > +
> > +#define pud_alloc_one pud_alloc_one
> > +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               return __pud_alloc_one(mm, addr);
> > +
> > +       return NULL;
> > +}
> > +
> > +#define pud_free pud_free
> > +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               __pud_free(mm, pud);
> > +}
> > +
> > +#define __pud_free_tlb(tlb, pud, addr)  pud_free((tlb)->mm, pud)
> >  #endif /* __PAGETABLE_PMD_FOLDED */
> >
> >  static inline pgd_t *pgd_alloc(struct mm_struct *mm)
> > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> > index 228261aa9628..bbbdd66e5e2f 100644
> > --- a/arch/riscv/include/asm/pgtable-64.h
> > +++ b/arch/riscv/include/asm/pgtable-64.h
> > @@ -8,16 +8,36 @@
> >
> >  #include <linux/const.h>
> >
> > -#define PGDIR_SHIFT     30
> > +extern bool pgtable_l4_enabled;
> > +
> > +#define PGDIR_SHIFT_L3  30
> > +#define PGDIR_SHIFT_L4  39
> > +#define PGDIR_SIZE_L3   (_AC(1, UL) << PGDIR_SHIFT_L3)
> > +
> > +#define PGDIR_SHIFT     (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3)
> >  /* Size of region mapped by a page global directory */
> >  #define PGDIR_SIZE      (_AC(1, UL) << PGDIR_SHIFT)
> >  #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
> >
> > +/* pud is folded into pgd in case of 3-level page table */
> > +#define PUD_SHIFT      30
> > +#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
> > +#define PUD_MASK       (~(PUD_SIZE - 1))
> > +
> >  #define PMD_SHIFT       21
> >  /* Size of region mapped by a page middle directory */
> >  #define PMD_SIZE        (_AC(1, UL) << PMD_SHIFT)
> >  #define PMD_MASK        (~(PMD_SIZE - 1))
> >
> > +/* Page Upper Directory entry */
> > +typedef struct {
> > +       unsigned long pud;
> > +} pud_t;
> > +
> > +#define pud_val(x)      ((x).pud)
> > +#define __pud(x)        ((pud_t) { (x) })
> > +#define PTRS_PER_PUD    (PAGE_SIZE / sizeof(pud_t))
> > +
> >  /* Page Middle Directory entry */
> >  typedef struct {
> >         unsigned long pmd;
> > @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp)
> >         set_pud(pudp, __pud(0));
> >  }
> >
> > +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
> > +{
> > +       return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> > +}
> > +
> > +static inline unsigned long _pud_pfn(pud_t pud)
> > +{
> > +       return pud_val(pud) >> _PAGE_PFN_SHIFT;
> > +}
> > +
> >  static inline pmd_t *pud_pgtable(pud_t pud)
> >  {
> >         return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
> > @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud)
> >         return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
> >  }
> >
> > +#define mm_pud_folded  mm_pud_folded
> > +static inline bool mm_pud_folded(struct mm_struct *mm)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               return false;
> > +
> > +       return true;
> > +}
> > +
> > +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
> > +
> >  static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
> >  {
> >         return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> > @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
> >  #define pmd_ERROR(e) \
> >         pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
> >
> > +#define pud_ERROR(e)   \
> > +       pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
> > +
> > +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               *p4dp = p4d;
> > +       else
> > +               set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
> > +}
> > +
> > +static inline int p4d_none(p4d_t p4d)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               return (p4d_val(p4d) == 0);
> > +
> > +       return 0;
> > +}
> > +
> > +static inline int p4d_present(p4d_t p4d)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               return (p4d_val(p4d) & _PAGE_PRESENT);
> > +
> > +       return 1;
> > +}
> > +
> > +static inline int p4d_bad(p4d_t p4d)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               return !p4d_present(p4d);
> > +
> > +       return 0;
> > +}
> > +
> > +static inline void p4d_clear(p4d_t *p4d)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               set_p4d(p4d, __p4d(0));
> > +}
> > +
> > +static inline pud_t *p4d_pgtable(p4d_t p4d)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> > +
> > +       return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
> > +}
> > +
> > +static inline struct page *p4d_page(p4d_t p4d)
> > +{
> > +       return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> > +}
> > +
> > +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> > +
> > +#define pud_offset pud_offset
> > +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> > +{
> > +       if (pgtable_l4_enabled)
> > +               return p4d_pgtable(*p4d) + pud_index(address);
> > +
> > +       return (pud_t *)p4d;
> > +}
> > +
> >  #endif /* _ASM_RISCV_PGTABLE_64_H */
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index e1a52e22ad7e..e1c74ef4ead2 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -51,7 +51,7 @@
> >   * position vmemmap directly below the VMALLOC region.
> >   */
> >  #ifdef CONFIG_64BIT
> > -#define VA_BITS                39
> > +#define VA_BITS                (pgtable_l4_enabled ? 48 : 39)
> >  #else
> >  #define VA_BITS                32
> >  #endif
> > @@ -90,8 +90,7 @@
> >
> >  #ifndef __ASSEMBLY__
> >
> > -/* Page Upper Directory not used in RISC-V */
> > -#include <asm-generic/pgtable-nopud.h>
> > +#include <asm-generic/pgtable-nop4d.h>
> >  #include <asm/page.h>
> >  #include <asm/tlbflush.h>
> >  #include <linux/mm_types.h>
> > @@ -113,6 +112,17 @@
> >  #define XIP_FIXUP(addr)                (addr)
> >  #endif /* CONFIG_XIP_KERNEL */
> >
> > +struct pt_alloc_ops {
> > +       pte_t *(*get_pte_virt)(phys_addr_t pa);
> > +       phys_addr_t (*alloc_pte)(uintptr_t va);
> > +#ifndef __PAGETABLE_PMD_FOLDED
> > +       pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> > +       phys_addr_t (*alloc_pmd)(uintptr_t va);
> > +       pud_t *(*get_pud_virt)(phys_addr_t pa);
> > +       phys_addr_t (*alloc_pud)(uintptr_t va);
> > +#endif
> > +};
> > +
> >  #ifdef CONFIG_MMU
> >  /* Number of entries in the page global directory */
> >  #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
> > @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
> >   * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
> >   */
> >  #ifdef CONFIG_64BIT
> > -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
> > +#define TASK_SIZE      (PGDIR_SIZE * PTRS_PER_PGD / 2)
> > +#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
> >  #else
> > -#define TASK_SIZE FIXADDR_START
> > +#define TASK_SIZE      FIXADDR_START
> > +#define TASK_SIZE_MIN  TASK_SIZE
> This is used by efi-stub.c; the rv64 compat patch also needs it, and we
> reuse the DEFAULT_MAP_WINDOW_64 macro.
>
> TASK_SIZE_MIN is also okay for me, but I think it should be a separate
> patch together with the efi-stub modification.

IMO, TASK_SIZE_MIN is more explicit than DEFAULT_MAP_WINDOW_64. I'll
split this change in the next series.

> https://lore.kernel.org/linux-riscv/20211228143958.3409187-9-guoren@kernel.org/
>
> I've merged your patchset with the compat tree and we are testing them
> together thoroughly and carefully.
> https://github.com/c-sky/csky-linux/tree/riscv_compat_v2_sv48_v3
>
> Booting with rv32_rootfs & 64_rootfs now passes. But I will give you a
> Tested-by later, once testing is fully complete. Your patch set is very
> helpful, thanks.

Thanks a lot, that will help move forward ;)

>
> PS: Could you give users a way to choose between sv48 and sv39 in the dts?
>

This is already implemented in patch 13.

Thanks!

Alex

>
> >  #endif
> >
> >  #else /* CONFIG_MMU */
> > @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa;
> >  #define dtb_early_va   _dtb_early_va
> >  #define dtb_early_pa   _dtb_early_pa
> >  #endif /* CONFIG_XIP_KERNEL */
> > +extern u64 satp_mode;
> > +extern bool pgtable_l4_enabled;
> >
> >  void paging_init(void);
> >  void misc_mem_init(void);
> > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> > index 52c5ff9804c5..c3c0ed559770 100644
> > --- a/arch/riscv/kernel/head.S
> > +++ b/arch/riscv/kernel/head.S
> > @@ -95,7 +95,8 @@ relocate:
> >
> >         /* Compute satp for kernel page tables, but don't load it yet */
> >         srl a2, a0, PAGE_SHIFT
> > -       li a1, SATP_MODE
> > +       la a1, satp_mode
> > +       REG_L a1, 0(a1)
> >         or a2, a2, a1
> >
> >         /*
> > diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
> > index ee3459cb6750..a7246872bd30 100644
> > --- a/arch/riscv/mm/context.c
> > +++ b/arch/riscv/mm/context.c
> > @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
> >  switch_mm_fast:
> >         csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
> >                   ((cntx & asid_mask) << SATP_ASID_SHIFT) |
> > -                 SATP_MODE);
> > +                 satp_mode);
> >
> >         if (need_flush_tlb)
> >                 local_flush_tlb_all();
> > @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
> >  static void set_mm_noasid(struct mm_struct *mm)
> >  {
> >         /* Switch the page table and blindly nuke entire local TLB */
> > -       csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
> > +       csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
> >         local_flush_tlb_all();
> >  }
> >
> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> > index 1552226fb6bd..6a19a1b1caf8 100644
> > --- a/arch/riscv/mm/init.c
> > +++ b/arch/riscv/mm/init.c
> > @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map);
> >  #define kernel_map     (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map))
> >  #endif
> >
> > +#ifdef CONFIG_64BIT
> > +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39;
> > +#else
> > +u64 satp_mode = SATP_MODE_32;
> > +#endif
> > +EXPORT_SYMBOL(satp_mode);
> > +
> > +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ?
> > +                               true : false;
> > +EXPORT_SYMBOL(pgtable_l4_enabled);
> > +
> >  phys_addr_t phys_ram_base __ro_after_init;
> >  EXPORT_SYMBOL(phys_ram_base);
> >
> > @@ -53,15 +64,6 @@ extern char _start[];
> >  void *_dtb_early_va __initdata;
> >  uintptr_t _dtb_early_pa __initdata;
> >
> > -struct pt_alloc_ops {
> > -       pte_t *(*get_pte_virt)(phys_addr_t pa);
> > -       phys_addr_t (*alloc_pte)(uintptr_t va);
> > -#ifndef __PAGETABLE_PMD_FOLDED
> > -       pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> > -       phys_addr_t (*alloc_pmd)(uintptr_t va);
> > -#endif
> > -};
> > -
> >  static phys_addr_t dma32_phys_limit __initdata;
> >
> >  static void __init zone_sizes_init(void)
> > @@ -222,7 +224,7 @@ static void __init setup_bootmem(void)
> >  }
> >
> >  #ifdef CONFIG_MMU
> > -static struct pt_alloc_ops _pt_ops __initdata;
> > +struct pt_alloc_ops _pt_ops __initdata;
> >
> >  #ifdef CONFIG_XIP_KERNEL
> >  #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
> > @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
> >  static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
> >
> >  pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
> > +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
> >  static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
> >
> >  #ifdef CONFIG_XIP_KERNEL
> > @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
> >  #define early_pmd      ((pmd_t *)XIP_FIXUP(early_pmd))
> >  #endif /* CONFIG_XIP_KERNEL */
> >
> > +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
> > +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
> > +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
> > +
> > +#ifdef CONFIG_XIP_KERNEL
> > +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud))
> > +#define fixmap_pud     ((pud_t *)XIP_FIXUP(fixmap_pud))
> > +#define early_pud      ((pud_t *)XIP_FIXUP(early_pud))
> > +#endif /* CONFIG_XIP_KERNEL */
> > +
> >  static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
> >  {
> >         /* Before MMU is enabled */
> > @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
> >
> >  static phys_addr_t __init alloc_pmd_early(uintptr_t va)
> >  {
> > -       BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
> > +       BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT);
> >
> >         return (uintptr_t)early_pmd;
> >  }
> > @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
> >         create_pte_mapping(ptep, va, pa, sz, prot);
> >  }
> >
> > -#define pgd_next_t             pmd_t
> > -#define alloc_pgd_next(__va)   pt_ops.alloc_pmd(__va)
> > -#define get_pgd_next_virt(__pa)        pt_ops.get_pmd_virt(__pa)
> > +static pud_t *__init get_pud_virt_early(phys_addr_t pa)
> > +{
> > +       return (pud_t *)((uintptr_t)pa);
> > +}
> > +
> > +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
> > +{
> > +       clear_fixmap(FIX_PUD);
> > +       return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
> > +}
> > +
> > +static pud_t *__init get_pud_virt_late(phys_addr_t pa)
> > +{
> > +       return (pud_t *)__va(pa);
> > +}
> > +
> > +static phys_addr_t __init alloc_pud_early(uintptr_t va)
> > +{
> > +       /* Only one PUD is available for early mapping */
> > +       BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
> > +
> > +       return (uintptr_t)early_pud;
> > +}
> > +
> > +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
> > +{
> > +       return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> > +}
> > +
> > +static phys_addr_t alloc_pud_late(uintptr_t va)
> > +{
> > +       unsigned long vaddr;
> > +
> > +       vaddr = __get_free_page(GFP_KERNEL);
> > +       BUG_ON(!vaddr);
> > +       return __pa(vaddr);
> > +}
> > +
> > +static void __init create_pud_mapping(pud_t *pudp,
> > +                                     uintptr_t va, phys_addr_t pa,
> > +                                     phys_addr_t sz, pgprot_t prot)
> > +{
> > +       pmd_t *nextp;
> > +       phys_addr_t next_phys;
> > +       uintptr_t pud_index = pud_index(va);
> > +
> > +       if (sz == PUD_SIZE) {
> > +               if (pud_val(pudp[pud_index]) == 0)
> > +                       pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
> > +               return;
> > +       }
> > +
> > +       if (pud_val(pudp[pud_index]) == 0) {
> > +               next_phys = pt_ops.alloc_pmd(va);
> > +               pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
> > +               nextp = pt_ops.get_pmd_virt(next_phys);
> > +               memset(nextp, 0, PAGE_SIZE);
> > +       } else {
> > +               next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
> > +               nextp = pt_ops.get_pmd_virt(next_phys);
> > +       }
> > +
> > +       create_pmd_mapping(nextp, va, pa, sz, prot);
> > +}
> > +
> > +#define pgd_next_t             pud_t
> > +#define alloc_pgd_next(__va)   (pgtable_l4_enabled ?                   \
> > +               pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va))
> > +#define get_pgd_next_virt(__pa)        (pgtable_l4_enabled ?                   \
> > +               pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa))
> >  #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)     \
> > -       create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> > -#define fixmap_pgd_next                fixmap_pmd
> > +                               (pgtable_l4_enabled ?                   \
> > +               create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \
> > +               create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot))
> > +#define fixmap_pgd_next                (pgtable_l4_enabled ?                   \
> > +               (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
> > +#define trampoline_pgd_next    (pgtable_l4_enabled ?                   \
> > +               (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
> > +#define early_dtb_pgd_next     (pgtable_l4_enabled ?                   \
> > +               (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)
> >  #else
> >  #define pgd_next_t             pte_t
> >  #define alloc_pgd_next(__va)   pt_ops.alloc_pte(__va)
> >  #define get_pgd_next_virt(__pa)        pt_ops.get_pte_virt(__pa)
> >  #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)     \
> >         create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> > -#define fixmap_pgd_next                fixmap_pte
> > +#define fixmap_pgd_next                ((uintptr_t)fixmap_pte)
> > +#define early_dtb_pgd_next     ((uintptr_t)early_dtb_pmd)
> > +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot)
> >  #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot)
> > -#endif
> > +#endif /* __PAGETABLE_PMD_FOLDED */
> >
> >  void __init create_pgd_mapping(pgd_t *pgdp,
> >                                       uintptr_t va, phys_addr_t pa,
> > @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va)
> >  }
> >  #endif /* CONFIG_STRICT_KERNEL_RWX */
> >
> > +#ifdef CONFIG_64BIT
> > +static void __init disable_pgtable_l4(void)
> > +{
> > +       pgtable_l4_enabled = false;
> > +       kernel_map.page_offset = PAGE_OFFSET_L3;
> > +       satp_mode = SATP_MODE_39;
> > +}
> > +
> > +/*
> > + * There is a simple way to determine if 4-level is supported by the
> > + * underlying hardware: establish 1:1 mapping in 4-level page table mode
> > + * then read SATP to see if the configuration was taken into account
> > + * meaning sv48 is supported.
> > + */
> > +static __init void set_satp_mode(void)
> > +{
> > +       u64 identity_satp, hw_satp;
> > +       uintptr_t set_satp_mode_pmd;
> > +
> > +       set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
> > +       create_pgd_mapping(early_pg_dir,
> > +                          set_satp_mode_pmd, (uintptr_t)early_pud,
> > +                          PGDIR_SIZE, PAGE_TABLE);
> > +       create_pud_mapping(early_pud,
> > +                          set_satp_mode_pmd, (uintptr_t)early_pmd,
> > +                          PUD_SIZE, PAGE_TABLE);
> > +       /* Handle the case where set_satp_mode straddles 2 PMDs */
> > +       create_pmd_mapping(early_pmd,
> > +                          set_satp_mode_pmd, set_satp_mode_pmd,
> > +                          PMD_SIZE, PAGE_KERNEL_EXEC);
> > +       create_pmd_mapping(early_pmd,
> > +                          set_satp_mode_pmd + PMD_SIZE,
> > +                          set_satp_mode_pmd + PMD_SIZE,
> > +                          PMD_SIZE, PAGE_KERNEL_EXEC);
> > +
> > +       identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
> > +
> > +       local_flush_tlb_all();
> > +       csr_write(CSR_SATP, identity_satp);
> > +       hw_satp = csr_swap(CSR_SATP, 0ULL);
> > +       local_flush_tlb_all();
> > +
> > +       if (hw_satp != identity_satp)
> > +               disable_pgtable_l4();
> > +
> > +       memset(early_pg_dir, 0, PAGE_SIZE);
> > +       memset(early_pud, 0, PAGE_SIZE);
> > +       memset(early_pmd, 0, PAGE_SIZE);
> > +}
> > +#endif
> > +
> >  /*
> >   * setup_vm() is called from head.S with MMU-off.
> >   *
> > @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa)
> >         uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1);
> >
> >         create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA,
> > -                          IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa,
> > +                          IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa,
> >                            PGDIR_SIZE,
> >                            IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL);
> >
> > +       if (pgtable_l4_enabled) {
> > +               create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA,
> > +                                  (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE);
> > +       }
> > +
> >         if (IS_ENABLED(CONFIG_64BIT)) {
> >                 create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
> >                                    pa, PMD_SIZE, PAGE_KERNEL);
> > @@ -593,6 +738,8 @@ void pt_ops_set_early(void)
> >  #ifndef __PAGETABLE_PMD_FOLDED
> >         pt_ops.alloc_pmd = alloc_pmd_early;
> >         pt_ops.get_pmd_virt = get_pmd_virt_early;
> > +       pt_ops.alloc_pud = alloc_pud_early;
> > +       pt_ops.get_pud_virt = get_pud_virt_early;
> >  #endif
> >  }
> >
> > @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void)
> >  #ifndef __PAGETABLE_PMD_FOLDED
> >         pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap);
> >         pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap);
> > +       pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap);
> > +       pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap);
> >  #endif
> >  }
> >
> > @@ -625,6 +774,8 @@ void pt_ops_set_late(void)
> >  #ifndef __PAGETABLE_PMD_FOLDED
> >         pt_ops.alloc_pmd = alloc_pmd_late;
> >         pt_ops.get_pmd_virt = get_pmd_virt_late;
> > +       pt_ops.alloc_pud = alloc_pud_late;
> > +       pt_ops.get_pud_virt = get_pud_virt_late;
> >  #endif
> >  }
> >
> > @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >         pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd;
> >
> >         kernel_map.virt_addr = KERNEL_LINK_ADDR;
> > +       kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
> >
> >  #ifdef CONFIG_XIP_KERNEL
> >         kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
> > @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >         kernel_map.phys_addr = (uintptr_t)(&_start);
> >         kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
> >  #endif
> > +
> > +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
> > +       set_satp_mode();
> > +#endif
> > +
> >         kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
> >         kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
> >
> > @@ -676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >
> >         /* Setup early PGD for fixmap */
> >         create_pgd_mapping(early_pg_dir, FIXADDR_START,
> > -                          (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> > +                          fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> >
> >  #ifndef __PAGETABLE_PMD_FOLDED
> > -       /* Setup fixmap PMD */
> > +       /* Setup fixmap PUD and PMD */
> > +       if (pgtable_l4_enabled)
> > +               create_pud_mapping(fixmap_pud, FIXADDR_START,
> > +                                  (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
> >         create_pmd_mapping(fixmap_pmd, FIXADDR_START,
> >                            (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
> >         /* Setup trampoline PGD and PMD */
> >         create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
> > -                          (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> > +                          trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> > +       if (pgtable_l4_enabled)
> > +               create_pud_mapping(trampoline_pud, kernel_map.virt_addr,
> > +                                  (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
> >  #ifdef CONFIG_XIP_KERNEL
> >         create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
> >                            kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
> > @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >          * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap
> >          * range can not span multiple pmds.
> >          */
> > -       BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
> > +       BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
> >                      != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
> >
> >  #ifndef __PAGETABLE_PMD_FOLDED
> > @@ -783,9 +946,10 @@ static void __init setup_vm_final(void)
> >         /* Clear fixmap PTE and PMD mappings */
> >         clear_fixmap(FIX_PTE);
> >         clear_fixmap(FIX_PMD);
> > +       clear_fixmap(FIX_PUD);
> >
> >         /* Move to swapper page table */
> > -       csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> > +       csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
> >         local_flush_tlb_all();
> >
> >         pt_ops_set_late();
> > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> > index 1434a0225140..993f50571a3b 100644
> > --- a/arch/riscv/mm/kasan_init.c
> > +++ b/arch/riscv/mm/kasan_init.c
> > @@ -11,7 +11,29 @@
> >  #include <asm/fixmap.h>
> >  #include <asm/pgalloc.h>
> >
> > +/*
> > + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57
> > + * which is right before the kernel.
> > + *
> > + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate
> > + * the page global directory with kasan_early_shadow_pmd.
> > + *
> > + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping
> > + * must be divided as follows:
> > + * - the first PGD entry, although incomplete, is populated with
> > + *   kasan_early_shadow_pud/p4d
> > + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d
> > + * - the last PGD entry is shared with the kernel mapping so populated at the
> > + *   lower levels pud/p4d
> > + *
> > + * In addition, when shallow populating a kasan region (for example vmalloc),
> > + * this region may also not be aligned on PGDIR size, so we must go down to the
> > + * pud level too.
> > + */
> > +
> >  extern pgd_t early_pg_dir[PTRS_PER_PGD];
> > +extern struct pt_alloc_ops _pt_ops __initdata;
> > +#define pt_ops _pt_ops
> >
> >  static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
> >  {
> > @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned
> >         set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
> >  }
> >
> > -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
> > +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end)
> >  {
> >         phys_addr_t phys_addr;
> >         pmd_t *pmdp, *base_pmd;
> >         unsigned long next;
> >
> > -       base_pmd = (pmd_t *)pgd_page_vaddr(*pgd);
> > -       if (base_pmd == lm_alias(kasan_early_shadow_pmd))
> > +       if (pud_none(*pud)) {
> >                 base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
> > +       } else {
> > +               base_pmd = (pmd_t *)pud_pgtable(*pud);
> > +               if (base_pmd == lm_alias(kasan_early_shadow_pmd))
> > +                       base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
> > +       }
> >
> >         pmdp = base_pmd + pmd_index(vaddr);
> >
> > @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned
> >          * it entirely, memblock could allocate a page at a physical address
> >          * where KASAN is not populated yet and then we'd get a page fault.
> >          */
> > -       set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> > +       set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> > +}
> > +
> > +static void __init kasan_populate_pud(pgd_t *pgd,
> > +                                     unsigned long vaddr, unsigned long end,
> > +                                     bool early)
> > +{
> > +       phys_addr_t phys_addr;
> > +       pud_t *pudp, *base_pud;
> > +       unsigned long next;
> > +
> > +       if (early) {
> > +               /*
> > +                * We can't use pgd_page_vaddr here as it would return a linear
> > +                * mapping address but it is not mapped yet, but when populating
> > +                * early_pg_dir, we need the physical address and when populating
> > +                * swapper_pg_dir, we need the kernel virtual address so use
> > +                * pt_ops facility.
> > +                */
> > +               base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd)));
> > +       } else {
> > +               base_pud = (pud_t *)pgd_page_vaddr(*pgd);
> > +               if (base_pud == lm_alias(kasan_early_shadow_pud))
> > +                       base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE);
> > +       }
> > +
> > +       pudp = base_pud + pud_index(vaddr);
> > +
> > +       do {
> > +               next = pud_addr_end(vaddr, end);
> > +
> > +               if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) {
> > +                       if (early) {
> > +                               phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd));
> > +                               set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE));
> > +                               continue;
> > +                       } else {
> > +                               phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE);
> > +                               if (phys_addr) {
> > +                                       set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL));
> > +                                       continue;
> > +                               }
> > +                       }
> > +               }
> > +
> > +               kasan_populate_pmd(pudp, vaddr, next);
> > +       } while (pudp++, vaddr = next, vaddr != end);
> > +
> > +       /*
> > +        * Wait for the whole PGD to be populated before setting the PGD in
> > +        * the page table, otherwise, if we did set the PGD before populating
> > +        * it entirely, memblock could allocate a page at a physical address
> > +        * where KASAN is not populated yet and then we'd get a page fault.
> > +        */
> > +       if (!early)
> > +               set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE));
> >  }
> >
> > +#define kasan_early_shadow_pgd_next                    (pgtable_l4_enabled ?   \
> > +                               (uintptr_t)kasan_early_shadow_pud :             \
> > +                               (uintptr_t)kasan_early_shadow_pmd)
> > +#define kasan_populate_pgd_next(pgdp, vaddr, next, early)                      \
> > +               (pgtable_l4_enabled ?                                           \
> > +                       kasan_populate_pud(pgdp, vaddr, next, early) :          \
> > +                       kasan_populate_pmd((pud_t *)pgdp, vaddr, next))
> > +
> >  static void __init kasan_populate_pgd(pgd_t *pgdp,
> >                                       unsigned long vaddr, unsigned long end,
> >                                       bool early)
> > @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp,
> >                         }
> >                 }
> >
> > -               kasan_populate_pmd(pgdp, vaddr, next);
> > +               kasan_populate_pgd_next(pgdp, vaddr, next, early);
> >         } while (pgdp++, vaddr = next, vaddr != end);
> >  }
> >
> > @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end)
> >         memset(start, KASAN_SHADOW_INIT, end - start);
> >  }
> >
> > +static void __init kasan_shallow_populate_pud(pgd_t *pgdp,
> > +                                             unsigned long vaddr, unsigned long end,
> > +                                             bool kasan_populate)
> > +{
> > +       unsigned long next;
> > +       pud_t *pudp, *base_pud;
> > +       pmd_t *base_pmd;
> > +       bool is_kasan_pmd;
> > +
> > +       base_pud = (pud_t *)pgd_page_vaddr(*pgdp);
> > +       pudp = base_pud + pud_index(vaddr);
> > +
> > +       if (kasan_populate)
> > +               memcpy(base_pud, (void *)kasan_early_shadow_pgd_next,
> > +                      sizeof(pud_t) * PTRS_PER_PUD);
> > +
> > +       do {
> > +               next = pud_addr_end(vaddr, end);
> > +               is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd));
> > +
> > +               if (is_kasan_pmd) {
> > +                       base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> > +                       set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
> > +               }
> > +       } while (pudp++, vaddr = next, vaddr != end);
> > +}
> > +
> >  static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end)
> >  {
> >         unsigned long next;
> >         void *p;
> >         pgd_t *pgd_k = pgd_offset_k(vaddr);
> > +       bool is_kasan_pgd_next;
> >
> >         do {
> >                 next = pgd_addr_end(vaddr, end);
> > -               if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) {
> > +               is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) ==
> > +                                    (unsigned long)lm_alias(kasan_early_shadow_pgd_next));
> > +
> > +               if (is_kasan_pgd_next) {
> >                         p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> >                         set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
> >                 }
> > +
> > +               if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE)
> > +                       continue;
> > +
> > +               kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next);
> >         } while (pgd_k++, vaddr = next, vaddr != end);
> >  }
> >
> > diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
> > index 26e69788f27a..b3db5d91ed38 100644
> > --- a/drivers/firmware/efi/libstub/efi-stub.c
> > +++ b/drivers/firmware/efi/libstub/efi-stub.c
> > @@ -40,6 +40,8 @@
> >
> >  #ifdef CONFIG_ARM64
> >  # define EFI_RT_VIRTUAL_LIMIT  DEFAULT_MAP_WINDOW_64
> > +#elif defined(CONFIG_RISCV)
> > +# define EFI_RT_VIRTUAL_LIMIT  TASK_SIZE_MIN
> >  #else
> >  # define EFI_RT_VIRTUAL_LIMIT  TASK_SIZE
> >  #endif
> > --
> > 2.32.0
> >
>
>
> --
> Best Regards
>  Guo Ren
>
> ML: https://lore.kernel.org/linux-csky/
Alexandre Ghiti Jan. 4, 2022, 12:44 p.m. UTC | #6
Hi Jisheng,

On Sun, Dec 26, 2021 at 10:06 AM Jisheng Zhang
<jszhang3@mail.ustc.edu.cn> wrote:
>
> On Mon,  6 Dec 2021 11:46:51 +0100
> Alexandre Ghiti <alexandre.ghiti@canonical.com> wrote:
>
> > By adding a new 4th level of page table, give the possibility to 64bit
> > kernel to address 2^48 bytes of virtual address: in practice, that offers
> > 128TB of virtual address space to userspace and allows up to 64TB of
> > physical memory.
> >
> > If the underlying hardware does not support sv48, we will automatically
> > fallback to a standard 3-level page table by folding the new PUD level into
> > PGDIR level. In order to detect HW capabilities at runtime, we
> > use SATP feature that ignores writes with an unsupported mode.
> >
> > Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
> > ---
> >  arch/riscv/Kconfig                      |   4 +-
> >  arch/riscv/include/asm/csr.h            |   3 +-
> >  arch/riscv/include/asm/fixmap.h         |   1 +
> >  arch/riscv/include/asm/kasan.h          |   6 +-
> >  arch/riscv/include/asm/page.h           |  14 ++
> >  arch/riscv/include/asm/pgalloc.h        |  40 +++++
> >  arch/riscv/include/asm/pgtable-64.h     | 108 +++++++++++-
> >  arch/riscv/include/asm/pgtable.h        |  24 ++-
> >  arch/riscv/kernel/head.S                |   3 +-
> >  arch/riscv/mm/context.c                 |   4 +-
> >  arch/riscv/mm/init.c                    | 212 +++++++++++++++++++++---
> >  arch/riscv/mm/kasan_init.c              | 137 ++++++++++++++-
> >  drivers/firmware/efi/libstub/efi-stub.c |   2 +
> >  13 files changed, 514 insertions(+), 44 deletions(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index ac6c0cd9bc29..d28fe0148e13 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -150,7 +150,7 @@ config PAGE_OFFSET
> >       hex
> >       default 0xC0000000 if 32BIT
> >       default 0x80000000 if 64BIT && !MMU
> > -     default 0xffffffd800000000 if 64BIT
> > +     default 0xffffaf8000000000 if 64BIT
> >
> >  config KASAN_SHADOW_OFFSET
> >       hex
> > @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM
> >
> >  config PGTABLE_LEVELS
> >       int
> > -     default 3 if 64BIT
> > +     default 4 if 64BIT
> >       default 2
> >
> >  config LOCKDEP_SUPPORT
> > diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
> > index 87ac65696871..3fdb971c7896 100644
> > --- a/arch/riscv/include/asm/csr.h
> > +++ b/arch/riscv/include/asm/csr.h
> > @@ -40,14 +40,13 @@
> >  #ifndef CONFIG_64BIT
> >  #define SATP_PPN     _AC(0x003FFFFF, UL)
> >  #define SATP_MODE_32 _AC(0x80000000, UL)
> > -#define SATP_MODE    SATP_MODE_32
> >  #define SATP_ASID_BITS       9
> >  #define SATP_ASID_SHIFT      22
> >  #define SATP_ASID_MASK       _AC(0x1FF, UL)
> >  #else
> >  #define SATP_PPN     _AC(0x00000FFFFFFFFFFF, UL)
> >  #define SATP_MODE_39 _AC(0x8000000000000000, UL)
> > -#define SATP_MODE    SATP_MODE_39
> > +#define SATP_MODE_48 _AC(0x9000000000000000, UL)
> >  #define SATP_ASID_BITS       16
> >  #define SATP_ASID_SHIFT      44
> >  #define SATP_ASID_MASK       _AC(0xFFFF, UL)
> > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> > index 54cbf07fb4e9..58a718573ad6 100644
> > --- a/arch/riscv/include/asm/fixmap.h
> > +++ b/arch/riscv/include/asm/fixmap.h
> > @@ -24,6 +24,7 @@ enum fixed_addresses {
> >       FIX_HOLE,
> >       FIX_PTE,
> >       FIX_PMD,
> > +     FIX_PUD,
> >       FIX_TEXT_POKE1,
> >       FIX_TEXT_POKE0,
> >       FIX_EARLYCON_MEM_BASE,
> > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
> > index 743e6ff57996..0b85e363e778 100644
> > --- a/arch/riscv/include/asm/kasan.h
> > +++ b/arch/riscv/include/asm/kasan.h
> > @@ -28,7 +28,11 @@
> >  #define KASAN_SHADOW_SCALE_SHIFT     3
> >
> >  #define KASAN_SHADOW_SIZE    (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
> > -#define KASAN_SHADOW_START   (KASAN_SHADOW_END - KASAN_SHADOW_SIZE)
> > +/*
> > + * Depending on the size of the virtual address space, the region may not be
> > + * aligned on PGDIR_SIZE, so force its alignment to ease its population.
> > + */
> > +#define KASAN_SHADOW_START   ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
> >  #define KASAN_SHADOW_END     MODULES_LOWEST_VADDR
> >  #define KASAN_SHADOW_OFFSET  _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
> >
> > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> > index e03559f9b35e..d089fe46f7d8 100644
> > --- a/arch/riscv/include/asm/page.h
> > +++ b/arch/riscv/include/asm/page.h
> > @@ -31,7 +31,20 @@
> >   * When not using MMU this corresponds to the first free page in
> >   * physical memory (aligned on a page boundary).
> >   */
> > +#ifdef CONFIG_64BIT
> > +#ifdef CONFIG_MMU
> > +#define PAGE_OFFSET          kernel_map.page_offset
> > +#else
> > +#define PAGE_OFFSET          _AC(CONFIG_PAGE_OFFSET, UL)
> > +#endif
> > +/*
> > + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
> > + * define the PAGE_OFFSET value for SV39.
> > + */
> > +#define PAGE_OFFSET_L3               _AC(0xffffffd800000000, UL)
> > +#else
> >  #define PAGE_OFFSET          _AC(CONFIG_PAGE_OFFSET, UL)
> > +#endif /* CONFIG_64BIT */
> >
> >  /*
> >   * Half of the kernel address space (half of the entries of the page global
> > @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base;
> >  #endif /* CONFIG_MMU */
> >
> >  struct kernel_mapping {
> > +     unsigned long page_offset;
> >       unsigned long virt_addr;
> >       uintptr_t phys_addr;
> >       uintptr_t size;
> > diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
> > index 0af6933a7100..11823004b87a 100644
> > --- a/arch/riscv/include/asm/pgalloc.h
> > +++ b/arch/riscv/include/asm/pgalloc.h
> > @@ -11,6 +11,8 @@
> >  #include <asm/tlb.h>
> >
> >  #ifdef CONFIG_MMU
> > +#define __HAVE_ARCH_PUD_ALLOC_ONE
> > +#define __HAVE_ARCH_PUD_FREE
> >  #include <asm-generic/pgalloc.h>
> >
> >  static inline void pmd_populate_kernel(struct mm_struct *mm,
> > @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> >
> >       set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> >  }
> > +
> > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
> > +{
> > +     if (pgtable_l4_enabled) {
> > +             unsigned long pfn = virt_to_pfn(pud);
> > +
> > +             set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> > +     }
> > +}
> > +
> > +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> > +                                  pud_t *pud)
> > +{
> > +     if (pgtable_l4_enabled) {
> > +             unsigned long pfn = virt_to_pfn(pud);
> > +
> > +             set_p4d_safe(p4d,
> > +                          __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> > +     }
> > +}
> > +
> > +#define pud_alloc_one pud_alloc_one
> > +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             return __pud_alloc_one(mm, addr);
> > +
> > +     return NULL;
> > +}
> > +
> > +#define pud_free pud_free
> > +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             __pud_free(mm, pud);
> > +}
> > +
> > +#define __pud_free_tlb(tlb, pud, addr)  pud_free((tlb)->mm, pud)
> >  #endif /* __PAGETABLE_PMD_FOLDED */
> >
> >  static inline pgd_t *pgd_alloc(struct mm_struct *mm)
> > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> > index 228261aa9628..bbbdd66e5e2f 100644
> > --- a/arch/riscv/include/asm/pgtable-64.h
> > +++ b/arch/riscv/include/asm/pgtable-64.h
> > @@ -8,16 +8,36 @@
> >
> >  #include <linux/const.h>
> >
> > -#define PGDIR_SHIFT     30
> > +extern bool pgtable_l4_enabled;
> > +
> > +#define PGDIR_SHIFT_L3  30
> > +#define PGDIR_SHIFT_L4  39
> > +#define PGDIR_SIZE_L3   (_AC(1, UL) << PGDIR_SHIFT_L3)
> > +
> > +#define PGDIR_SHIFT     (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3)
> >  /* Size of region mapped by a page global directory */
> >  #define PGDIR_SIZE      (_AC(1, UL) << PGDIR_SHIFT)
> >  #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
> >
> > +/* pud is folded into pgd in case of 3-level page table */
> > +#define PUD_SHIFT      30
> > +#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
> > +#define PUD_MASK       (~(PUD_SIZE - 1))
> > +
> >  #define PMD_SHIFT       21
> >  /* Size of region mapped by a page middle directory */
> >  #define PMD_SIZE        (_AC(1, UL) << PMD_SHIFT)
> >  #define PMD_MASK        (~(PMD_SIZE - 1))
> >
> > +/* Page Upper Directory entry */
> > +typedef struct {
> > +     unsigned long pud;
> > +} pud_t;
> > +
> > +#define pud_val(x)      ((x).pud)
> > +#define __pud(x)        ((pud_t) { (x) })
> > +#define PTRS_PER_PUD    (PAGE_SIZE / sizeof(pud_t))
> > +
> >  /* Page Middle Directory entry */
> >  typedef struct {
> >       unsigned long pmd;
> > @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp)
> >       set_pud(pudp, __pud(0));
> >  }
> >
> > +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
> > +{
> > +     return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> > +}
> > +
> > +static inline unsigned long _pud_pfn(pud_t pud)
> > +{
> > +     return pud_val(pud) >> _PAGE_PFN_SHIFT;
> > +}
> > +
> >  static inline pmd_t *pud_pgtable(pud_t pud)
> >  {
> >       return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
> > @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud)
> >       return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
> >  }
> >
> > +#define mm_pud_folded  mm_pud_folded
> > +static inline bool mm_pud_folded(struct mm_struct *mm)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             return false;
> > +
> > +     return true;
> > +}
> > +
> > +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
> > +
> >  static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
> >  {
> >       return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> > @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
> >  #define pmd_ERROR(e) \
> >       pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
> >
> > +#define pud_ERROR(e)   \
> > +     pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
> > +
> > +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             *p4dp = p4d;
> > +     else
> > +             set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
> > +}
> > +
> > +static inline int p4d_none(p4d_t p4d)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             return (p4d_val(p4d) == 0);
> > +
> > +     return 0;
> > +}
> > +
> > +static inline int p4d_present(p4d_t p4d)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             return (p4d_val(p4d) & _PAGE_PRESENT);
> > +
> > +     return 1;
> > +}
> > +
> > +static inline int p4d_bad(p4d_t p4d)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             return !p4d_present(p4d);
> > +
> > +     return 0;
> > +}
> > +
> > +static inline void p4d_clear(p4d_t *p4d)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             set_p4d(p4d, __p4d(0));
> > +}
> > +
> > +static inline pud_t *p4d_pgtable(p4d_t p4d)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> > +
> > +     return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
> > +}
> > +
> > +static inline struct page *p4d_page(p4d_t p4d)
> > +{
> > +     return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> > +}
> > +
> > +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> > +
> > +#define pud_offset pud_offset
> > +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> > +{
> > +     if (pgtable_l4_enabled)
> > +             return p4d_pgtable(*p4d) + pud_index(address);
> > +
> > +     return (pud_t *)p4d;
> > +}
> > +
> >  #endif /* _ASM_RISCV_PGTABLE_64_H */
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index e1a52e22ad7e..e1c74ef4ead2 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -51,7 +51,7 @@
> >   * position vmemmap directly below the VMALLOC region.
> >   */
> >  #ifdef CONFIG_64BIT
> > -#define VA_BITS              39
> > +#define VA_BITS              (pgtable_l4_enabled ? 48 : 39)
> >  #else
> >  #define VA_BITS              32
> >  #endif
> > @@ -90,8 +90,7 @@
> >
> >  #ifndef __ASSEMBLY__
> >
> > -/* Page Upper Directory not used in RISC-V */
> > -#include <asm-generic/pgtable-nopud.h>
> > +#include <asm-generic/pgtable-nop4d.h>
> >  #include <asm/page.h>
> >  #include <asm/tlbflush.h>
> >  #include <linux/mm_types.h>
> > @@ -113,6 +112,17 @@
> >  #define XIP_FIXUP(addr)              (addr)
> >  #endif /* CONFIG_XIP_KERNEL */
> >
> > +struct pt_alloc_ops {
> > +     pte_t *(*get_pte_virt)(phys_addr_t pa);
> > +     phys_addr_t (*alloc_pte)(uintptr_t va);
> > +#ifndef __PAGETABLE_PMD_FOLDED
> > +     pmd_t *(*get_pmd_virt)(phys_addr_t pa);
> > +     phys_addr_t (*alloc_pmd)(uintptr_t va);
> > +     pud_t *(*get_pud_virt)(phys_addr_t pa);
> > +     phys_addr_t (*alloc_pud)(uintptr_t va);
> > +#endif
> > +};
> > +
> >  #ifdef CONFIG_MMU
> >  /* Number of entries in the page global directory */
> >  #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
> > @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
> >   * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
> >   */
> >  #ifdef CONFIG_64BIT
> > -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
> > +#define TASK_SIZE      (PGDIR_SIZE * PTRS_PER_PGD / 2)
> > +#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
> >  #else
> > -#define TASK_SIZE FIXADDR_START
> > +#define TASK_SIZE    FIXADDR_START
> > +#define TASK_SIZE_MIN        TASK_SIZE
> >  #endif
> >
> >  #else /* CONFIG_MMU */
> > @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa;
> >  #define dtb_early_va _dtb_early_va
> >  #define dtb_early_pa _dtb_early_pa
> >  #endif /* CONFIG_XIP_KERNEL */
> > +extern u64 satp_mode;
> > +extern bool pgtable_l4_enabled;
> >
> >  void paging_init(void);
> >  void misc_mem_init(void);
> > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> > index 52c5ff9804c5..c3c0ed559770 100644
> > --- a/arch/riscv/kernel/head.S
> > +++ b/arch/riscv/kernel/head.S
> > @@ -95,7 +95,8 @@ relocate:
> >
> >       /* Compute satp for kernel page tables, but don't load it yet */
> >       srl a2, a0, PAGE_SHIFT
> > -     li a1, SATP_MODE
> > +     la a1, satp_mode
> > +     REG_L a1, 0(a1)
> >       or a2, a2, a1
> >
> >       /*
> > diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
> > index ee3459cb6750..a7246872bd30 100644
> > --- a/arch/riscv/mm/context.c
> > +++ b/arch/riscv/mm/context.c
> > @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
> >  switch_mm_fast:
> >       csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
> >                 ((cntx & asid_mask) << SATP_ASID_SHIFT) |
> > -               SATP_MODE);
> > +               satp_mode);
> >
> >       if (need_flush_tlb)
> >               local_flush_tlb_all();
> > @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
> >  static void set_mm_noasid(struct mm_struct *mm)
> >  {
> >       /* Switch the page table and blindly nuke entire local TLB */
> > -     csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
> > +     csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
> >       local_flush_tlb_all();
> >  }
> >
> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> > index 1552226fb6bd..6a19a1b1caf8 100644
> > --- a/arch/riscv/mm/init.c
> > +++ b/arch/riscv/mm/init.c
> > @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map);
> >  #define kernel_map   (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map))
> >  #endif
> >
> > +#ifdef CONFIG_64BIT
> > +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39;
> > +#else
> > +u64 satp_mode = SATP_MODE_32;
> > +#endif
> > +EXPORT_SYMBOL(satp_mode);
> > +
> > +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ?
> > +                             true : false;
>
> Hi Alex,
>
> I'm not sure whether we can use a static key for pgtable_l4_enabled or
> not. Obviously, for a specific HW platform, pgtable_l4_enabled won't change
> after boot, and it seems to sit on a hot code path, so IMHO, a static key may
> be suitable for it.

Thanks for the suggestion, I'll explore that after this series is
merged if you don't mind.

Thanks,

Alex

>
> Thanks
>
Nick Kossifidis April 26, 2022, 5:57 a.m. UTC | #7
Hello Alex,

On 12/6/21 12:46, Alexandre Ghiti wrote:
> 
> +#ifdef CONFIG_64BIT
> +static void __init disable_pgtable_l4(void)
> +{
> +	pgtable_l4_enabled = false;
> +	kernel_map.page_offset = PAGE_OFFSET_L3;
> +	satp_mode = SATP_MODE_39;
> +}
> +
> +/*
> + * There is a simple way to determine if 4-level is supported by the
> + * underlying hardware: establish 1:1 mapping in 4-level page table mode
> + * then read SATP to see if the configuration was taken into account
> + * meaning sv48 is supported.
> + */
> +static __init void set_satp_mode(void)
> +{
> +	u64 identity_satp, hw_satp;
> +	uintptr_t set_satp_mode_pmd;
> +
> +	set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
> +	create_pgd_mapping(early_pg_dir,
> +			   set_satp_mode_pmd, (uintptr_t)early_pud,
> +			   PGDIR_SIZE, PAGE_TABLE);
> +	create_pud_mapping(early_pud,
> +			   set_satp_mode_pmd, (uintptr_t)early_pmd,
> +			   PUD_SIZE, PAGE_TABLE);
> +	/* Handle the case where set_satp_mode straddles 2 PMDs */
> +	create_pmd_mapping(early_pmd,
> +			   set_satp_mode_pmd, set_satp_mode_pmd,
> +			   PMD_SIZE, PAGE_KERNEL_EXEC);
> +	create_pmd_mapping(early_pmd,
> +			   set_satp_mode_pmd + PMD_SIZE,
> +			   set_satp_mode_pmd + PMD_SIZE,
> +			   PMD_SIZE, PAGE_KERNEL_EXEC);
> +
> +	identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
> +
> +	local_flush_tlb_all();
> +	csr_write(CSR_SATP, identity_satp);
> +	hw_satp = csr_swap(CSR_SATP, 0ULL);
> +	local_flush_tlb_all();
> +
> +	if (hw_satp != identity_satp)
> +		disable_pgtable_l4();
> +
> +	memset(early_pg_dir, 0, PAGE_SIZE);
> +	memset(early_pud, 0, PAGE_SIZE);
> +	memset(early_pmd, 0, PAGE_SIZE);
> +}
> +#endif
> +

When doing the 1:1 mapping you don't take into account the limitation
that all bits above 47 need to have the same value as bit 47. If the
kernel resides at a high physical address with bit 47 set, the
corresponding virtual address will be invalid, resulting in an instruction
fetch fault as the privileged spec mandates. We verified this bug on our
prototype. I suggest we rewrite this in assembly and do a proper satp
switch like we do in head.S, so that we don't need the 1:1 mapping and
we also have a way to recover in case this fails.

Regards,
Nick
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index ac6c0cd9bc29..d28fe0148e13 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -150,7 +150,7 @@  config PAGE_OFFSET
 	hex
 	default 0xC0000000 if 32BIT
 	default 0x80000000 if 64BIT && !MMU
-	default 0xffffffd800000000 if 64BIT
+	default 0xffffaf8000000000 if 64BIT
 
 config KASAN_SHADOW_OFFSET
 	hex
@@ -201,7 +201,7 @@  config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
 	int
-	default 3 if 64BIT
+	default 4 if 64BIT
 	default 2
 
 config LOCKDEP_SUPPORT
diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 87ac65696871..3fdb971c7896 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -40,14 +40,13 @@ 
 #ifndef CONFIG_64BIT
 #define SATP_PPN	_AC(0x003FFFFF, UL)
 #define SATP_MODE_32	_AC(0x80000000, UL)
-#define SATP_MODE	SATP_MODE_32
 #define SATP_ASID_BITS	9
 #define SATP_ASID_SHIFT	22
 #define SATP_ASID_MASK	_AC(0x1FF, UL)
 #else
 #define SATP_PPN	_AC(0x00000FFFFFFFFFFF, UL)
 #define SATP_MODE_39	_AC(0x8000000000000000, UL)
-#define SATP_MODE	SATP_MODE_39
+#define SATP_MODE_48	_AC(0x9000000000000000, UL)
 #define SATP_ASID_BITS	16
 #define SATP_ASID_SHIFT	44
 #define SATP_ASID_MASK	_AC(0xFFFF, UL)
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index 54cbf07fb4e9..58a718573ad6 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -24,6 +24,7 @@  enum fixed_addresses {
 	FIX_HOLE,
 	FIX_PTE,
 	FIX_PMD,
+	FIX_PUD,
 	FIX_TEXT_POKE1,
 	FIX_TEXT_POKE0,
 	FIX_EARLYCON_MEM_BASE,
diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
index 743e6ff57996..0b85e363e778 100644
--- a/arch/riscv/include/asm/kasan.h
+++ b/arch/riscv/include/asm/kasan.h
@@ -28,7 +28,11 @@ 
 #define KASAN_SHADOW_SCALE_SHIFT	3
 
 #define KASAN_SHADOW_SIZE	(UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
-#define KASAN_SHADOW_START	(KASAN_SHADOW_END - KASAN_SHADOW_SIZE)
+/*
+ * Depending on the size of the virtual address space, the region may not be
+ * aligned on PGDIR_SIZE, so force its alignment to ease its population.
+ */
+#define KASAN_SHADOW_START	((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
 #define KASAN_SHADOW_END	MODULES_LOWEST_VADDR
 #define KASAN_SHADOW_OFFSET	_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index e03559f9b35e..d089fe46f7d8 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -31,7 +31,20 @@ 
  * When not using MMU this corresponds to the first free page in
  * physical memory (aligned on a page boundary).
  */
+#ifdef CONFIG_64BIT
+#ifdef CONFIG_MMU
+#define PAGE_OFFSET		kernel_map.page_offset
+#else
+#define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
+#endif
+/*
+ * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
+ * define the PAGE_OFFSET value for SV39.
+ */
+#define PAGE_OFFSET_L3		_AC(0xffffffd800000000, UL)
+#else
 #define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
+#endif /* CONFIG_64BIT */
 
 /*
  * Half of the kernel address space (half of the entries of the page global
@@ -90,6 +103,7 @@  extern unsigned long riscv_pfn_base;
 #endif /* CONFIG_MMU */
 
 struct kernel_mapping {
+	unsigned long page_offset;
 	unsigned long virt_addr;
 	uintptr_t phys_addr;
 	uintptr_t size;
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 0af6933a7100..11823004b87a 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -11,6 +11,8 @@ 
 #include <asm/tlb.h>
 
 #ifdef CONFIG_MMU
+#define __HAVE_ARCH_PUD_ALLOC_ONE
+#define __HAVE_ARCH_PUD_FREE
 #include <asm-generic/pgalloc.h>
 
 static inline void pmd_populate_kernel(struct mm_struct *mm,
@@ -36,6 +38,44 @@  static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 
 	set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
 }
+
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
+{
+	if (pgtable_l4_enabled) {
+		unsigned long pfn = virt_to_pfn(pud);
+
+		set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+	}
+}
+
+static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
+				     pud_t *pud)
+{
+	if (pgtable_l4_enabled) {
+		unsigned long pfn = virt_to_pfn(pud);
+
+		set_p4d_safe(p4d,
+			     __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+	}
+}
+
+#define pud_alloc_one pud_alloc_one
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	if (pgtable_l4_enabled)
+		return __pud_alloc_one(mm, addr);
+
+	return NULL;
+}
+
+#define pud_free pud_free
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+	if (pgtable_l4_enabled)
+		__pud_free(mm, pud);
+}
+
+#define __pud_free_tlb(tlb, pud, addr)  pud_free((tlb)->mm, pud)
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 228261aa9628..bbbdd66e5e2f 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -8,16 +8,36 @@ 
 
 #include <linux/const.h>
 
-#define PGDIR_SHIFT     30
+extern bool pgtable_l4_enabled;
+
+#define PGDIR_SHIFT_L3  30
+#define PGDIR_SHIFT_L4  39
+#define PGDIR_SIZE_L3   (_AC(1, UL) << PGDIR_SHIFT_L3)
+
+#define PGDIR_SHIFT     (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3)
 /* Size of region mapped by a page global directory */
 #define PGDIR_SIZE      (_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
 
+/* pud is folded into pgd in case of 3-level page table */
+#define PUD_SHIFT      30
+#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK       (~(PUD_SIZE - 1))
+
 #define PMD_SHIFT       21
 /* Size of region mapped by a page middle directory */
 #define PMD_SIZE        (_AC(1, UL) << PMD_SHIFT)
 #define PMD_MASK        (~(PMD_SIZE - 1))
 
+/* Page Upper Directory entry: a single unsigned long wrapped in a struct */
+typedef struct {
+	unsigned long pud;
+} pud_t;
+
+#define pud_val(x)      ((x).pud)
+#define __pud(x)        ((pud_t) { (x) })
+#define PTRS_PER_PUD    (PAGE_SIZE / sizeof(pud_t))	/* pud_t slots in one page */
+
 /* Page Middle Directory entry */
 typedef struct {
 	unsigned long pmd;
@@ -59,6 +79,16 @@  static inline void pud_clear(pud_t *pudp)
 	set_pud(pudp, __pud(0));
 }
 
+static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
+{
+	return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));	/* PFN field | prot bits */
+}
+
+static inline unsigned long _pud_pfn(pud_t pud)
+{
+	return pud_val(pud) >> _PAGE_PFN_SHIFT;	/* undo the shift applied by pfn_pud() */
+}
+
 static inline pmd_t *pud_pgtable(pud_t pud)
 {
 	return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
@@ -69,6 +99,17 @@  static inline struct page *pud_page(pud_t pud)
 	return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
 }
 
+#define mm_pud_folded  mm_pud_folded
+static inline bool mm_pud_folded(struct mm_struct *mm)
+{
+	if (pgtable_l4_enabled)
+		return false;	/* 4-level paging: pud is a real level */
+
+	return true;	/* 3-level paging: pud is folded (mm is not consulted) */
+}
+
+#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+
 static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
 {
 	return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
@@ -84,4 +125,69 @@  static inline unsigned long _pmd_pfn(pmd_t pmd)
 #define pmd_ERROR(e) \
 	pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
 
+#define pud_ERROR(e)   \
+	pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+	if (pgtable_l4_enabled)
+		*p4dp = p4d;
+	else	/* 3-level: store the same raw value through set_pud() */
+		set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
+}
+
+static inline int p4d_none(p4d_t p4d)
+{
+	if (pgtable_l4_enabled)
+		return (p4d_val(p4d) == 0);
+
+	return 0;	/* 3-level (pud folded): never report the entry as empty here */
+}
+
+static inline int p4d_present(p4d_t p4d)
+{
+	if (pgtable_l4_enabled)
+		return (p4d_val(p4d) & _PAGE_PRESENT);
+
+	return 1;	/* 3-level (pud folded): always report the entry as present here */
+}
+
+static inline int p4d_bad(p4d_t p4d)
+{
+	if (pgtable_l4_enabled)
+		return !p4d_present(p4d);	/* "bad" here simply means "not present" */
+
+	return 0;	/* 3-level (pud folded): never report the entry as bad here */
+}
+
+static inline void p4d_clear(p4d_t *p4d)
+{
+	if (pgtable_l4_enabled)	/* 3-level: the entry is left untouched */
+		set_p4d(p4d, __p4d(0));
+}
+
+static inline pud_t *p4d_pgtable(p4d_t p4d)
+{
+	if (pgtable_l4_enabled)
+		return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
+	/* 3-level: reinterpret the raw entry as a pud and use pud_pgtable(). */
+	return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
+}
+
+static inline struct page *p4d_page(p4d_t p4d)
+{
+	return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);	/* same for 3- and 4-level */
+}
+
+#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+
+#define pud_offset pud_offset
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+{
+	if (pgtable_l4_enabled)
+		return p4d_pgtable(*p4d) + pud_index(address);
+
+	return (pud_t *)p4d;	/* 3-level: hand back the p4d slot itself as the pud */
+}
+
 #endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index e1a52e22ad7e..e1c74ef4ead2 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -51,7 +51,7 @@ 
  * position vmemmap directly below the VMALLOC region.
  */
 #ifdef CONFIG_64BIT
-#define VA_BITS		39
+#define VA_BITS		(pgtable_l4_enabled ? 48 : 39)
 #else
 #define VA_BITS		32
 #endif
@@ -90,8 +90,7 @@ 
 
 #ifndef __ASSEMBLY__
 
-/* Page Upper Directory not used in RISC-V */
-#include <asm-generic/pgtable-nopud.h>
+#include <asm-generic/pgtable-nop4d.h>
 #include <asm/page.h>
 #include <asm/tlbflush.h>
 #include <linux/mm_types.h>
@@ -113,6 +112,17 @@ 
 #define XIP_FIXUP(addr)		(addr)
 #endif /* CONFIG_XIP_KERNEL */
 
+struct pt_alloc_ops {
+	pte_t *(*get_pte_virt)(phys_addr_t pa);
+	phys_addr_t (*alloc_pte)(uintptr_t va);
+#ifndef __PAGETABLE_PMD_FOLDED
+	pmd_t *(*get_pmd_virt)(phys_addr_t pa);
+	phys_addr_t (*alloc_pmd)(uintptr_t va);
+	pud_t *(*get_pud_virt)(phys_addr_t pa);	/* PA of a pud table -> pointer to it */
+	phys_addr_t (*alloc_pud)(uintptr_t va);	/* returns the PA of a pud table */
+#endif
+};
+
 #ifdef CONFIG_MMU
 /* Number of entries in the page global directory */
 #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
@@ -669,9 +679,11 @@  static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
  * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
  */
 #ifdef CONFIG_64BIT
-#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
+#define TASK_SIZE      (PGDIR_SIZE * PTRS_PER_PGD / 2)
+#define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
 #else
-#define TASK_SIZE FIXADDR_START
+#define TASK_SIZE	FIXADDR_START
+#define TASK_SIZE_MIN	TASK_SIZE
 #endif
 
 #else /* CONFIG_MMU */
@@ -697,6 +709,8 @@  extern uintptr_t _dtb_early_pa;
 #define dtb_early_va	_dtb_early_va
 #define dtb_early_pa	_dtb_early_pa
 #endif /* CONFIG_XIP_KERNEL */
+extern u64 satp_mode;
+extern bool pgtable_l4_enabled;
 
 void paging_init(void);
 void misc_mem_init(void);
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 52c5ff9804c5..c3c0ed559770 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -95,7 +95,8 @@  relocate:
 
 	/* Compute satp for kernel page tables, but don't load it yet */
 	srl a2, a0, PAGE_SHIFT
-	li a1, SATP_MODE
+	la a1, satp_mode
+	REG_L a1, 0(a1)
 	or a2, a2, a1
 
 	/*
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index ee3459cb6750..a7246872bd30 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -192,7 +192,7 @@  static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
 switch_mm_fast:
 	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
 		  ((cntx & asid_mask) << SATP_ASID_SHIFT) |
-		  SATP_MODE);
+		  satp_mode);
 
 	if (need_flush_tlb)
 		local_flush_tlb_all();
@@ -201,7 +201,7 @@  static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
 static void set_mm_noasid(struct mm_struct *mm)
 {
 	/* Switch the page table and blindly nuke entire local TLB */
-	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
+	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
 	local_flush_tlb_all();
 }
 
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 1552226fb6bd..6a19a1b1caf8 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -37,6 +37,17 @@  EXPORT_SYMBOL(kernel_map);
 #define kernel_map	(*(struct kernel_mapping *)XIP_FIXUP(&kernel_map))
 #endif
 
+#ifdef CONFIG_64BIT
+u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39;
+#else
+u64 satp_mode = SATP_MODE_32;
+#endif
+EXPORT_SYMBOL(satp_mode);
+
+bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ?
+				true : false;
+EXPORT_SYMBOL(pgtable_l4_enabled);
+
 phys_addr_t phys_ram_base __ro_after_init;
 EXPORT_SYMBOL(phys_ram_base);
 
@@ -53,15 +64,6 @@  extern char _start[];
 void *_dtb_early_va __initdata;
 uintptr_t _dtb_early_pa __initdata;
 
-struct pt_alloc_ops {
-	pte_t *(*get_pte_virt)(phys_addr_t pa);
-	phys_addr_t (*alloc_pte)(uintptr_t va);
-#ifndef __PAGETABLE_PMD_FOLDED
-	pmd_t *(*get_pmd_virt)(phys_addr_t pa);
-	phys_addr_t (*alloc_pmd)(uintptr_t va);
-#endif
-};
-
 static phys_addr_t dma32_phys_limit __initdata;
 
 static void __init zone_sizes_init(void)
@@ -222,7 +224,7 @@  static void __init setup_bootmem(void)
 }
 
 #ifdef CONFIG_MMU
-static struct pt_alloc_ops _pt_ops __initdata;
+struct pt_alloc_ops _pt_ops __initdata;
 
 #ifdef CONFIG_XIP_KERNEL
 #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
@@ -238,6 +240,7 @@  pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
 static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
 
 pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
 static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
 
 #ifdef CONFIG_XIP_KERNEL
@@ -326,6 +329,16 @@  static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
 #define early_pmd      ((pmd_t *)XIP_FIXUP(early_pmd))
 #endif /* CONFIG_XIP_KERNEL */
 
+static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
+
+#ifdef CONFIG_XIP_KERNEL
+#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud))
+#define fixmap_pud     ((pud_t *)XIP_FIXUP(fixmap_pud))
+#define early_pud      ((pud_t *)XIP_FIXUP(early_pud))
+#endif /* CONFIG_XIP_KERNEL */
+
 static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
 {
 	/* Before MMU is enabled */
@@ -345,7 +358,7 @@  static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
 
 static phys_addr_t __init alloc_pmd_early(uintptr_t va)
 {
-	BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
+	BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT);
 
 	return (uintptr_t)early_pmd;
 }
@@ -391,21 +404,97 @@  static void __init create_pmd_mapping(pmd_t *pmdp,
 	create_pte_mapping(ptep, va, pa, sz, prot);
 }
 
-#define pgd_next_t		pmd_t
-#define alloc_pgd_next(__va)	pt_ops.alloc_pmd(__va)
-#define get_pgd_next_virt(__pa)	pt_ops.get_pmd_virt(__pa)
+static pud_t *__init get_pud_virt_early(phys_addr_t pa)
+{
+	return (pud_t *)((uintptr_t)pa);	/* the physical address is used as the pointer */
+}
+
+static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
+{
+	clear_fixmap(FIX_PUD);	/* drop whatever FIX_PUD mapped before */
+	return (pud_t *)set_fixmap_offset(FIX_PUD, pa);	/* remap the slot onto pa */
+}
+
+static pud_t *__init get_pud_virt_late(phys_addr_t pa)
+{
+	return (pud_t *)__va(pa);	/* pa -> kernel virtual address via __va() */
+}
+
+static phys_addr_t __init alloc_pud_early(uintptr_t va)
+{
+	/* Only early_pud exists: va must lie in [virt_addr, virt_addr + PGDIR_SIZE) */
+	BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
+
+	return (uintptr_t)early_pud;
+}
+
+static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
+{
+	return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);	/* one aligned page; va is unused */
+}
+
+static phys_addr_t alloc_pud_late(uintptr_t va)
+{
+	unsigned long vaddr;
+
+	vaddr = __get_free_page(GFP_KERNEL);
+	BUG_ON(!vaddr);	/* no fallback if the page allocation fails */
+	return __pa(vaddr);	/* callers expect a physical address */
+}
+
+static void __init create_pud_mapping(pud_t *pudp,
+				      uintptr_t va, phys_addr_t pa,
+				      phys_addr_t sz, pgprot_t prot)
+{
+	pmd_t *nextp;
+	phys_addr_t next_phys;
+	uintptr_t pud_index = pud_index(va);
+
+	if (sz == PUD_SIZE) {	/* PUD-sized request: write the entry directly */
+		if (pud_val(pudp[pud_index]) == 0)	/* an existing entry is kept */
+			pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
+		return;
+	}
+
+	if (pud_val(pudp[pud_index]) == 0) {	/* empty slot: new zeroed PMD table */
+		next_phys = pt_ops.alloc_pmd(va);
+		pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
+		nextp = pt_ops.get_pmd_virt(next_phys);
+		memset(nextp, 0, PAGE_SIZE);
+	} else {	/* reuse the PMD table this entry already points to */
+		next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
+		nextp = pt_ops.get_pmd_virt(next_phys);
+	}
+
+	create_pmd_mapping(nextp, va, pa, sz, prot);
+}
+
+#define pgd_next_t		pud_t
+#define alloc_pgd_next(__va)	(pgtable_l4_enabled ?			\
+		pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va))
+#define get_pgd_next_virt(__pa)	(pgtable_l4_enabled ?			\
+		pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa))
 #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)	\
-	create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
-#define fixmap_pgd_next		fixmap_pmd
+				(pgtable_l4_enabled ?			\
+		create_pud_mapping(__nextp, __va, __pa, __sz, __prot) :	\
+		create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot))
+#define fixmap_pgd_next		(pgtable_l4_enabled ?			\
+		(uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
+#define trampoline_pgd_next	(pgtable_l4_enabled ?			\
+		(uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
+#define early_dtb_pgd_next	(pgtable_l4_enabled ?			\
+		(uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)
 #else
 #define pgd_next_t		pte_t
 #define alloc_pgd_next(__va)	pt_ops.alloc_pte(__va)
 #define get_pgd_next_virt(__pa)	pt_ops.get_pte_virt(__pa)
 #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)	\
 	create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
-#define fixmap_pgd_next		fixmap_pte
+#define fixmap_pgd_next		((uintptr_t)fixmap_pte)
+#define early_dtb_pgd_next	((uintptr_t)early_dtb_pmd)
+#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot)
 #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot)
-#endif
+#endif /* __PAGETABLE_PMD_FOLDED */
 
 void __init create_pgd_mapping(pgd_t *pgdp,
 				      uintptr_t va, phys_addr_t pa,
@@ -493,6 +582,57 @@  static __init pgprot_t pgprot_from_va(uintptr_t va)
 }
 #endif /* CONFIG_STRICT_KERNEL_RWX */
 
+#ifdef CONFIG_64BIT
+static void __init disable_pgtable_l4(void)
+{
+	pgtable_l4_enabled = false;	/* pud-level helpers now take their 3-level paths */
+	kernel_map.page_offset = PAGE_OFFSET_L3;
+	satp_mode = SATP_MODE_39;	/* fall back to sv39 */
+}
+
+/*
+ * Probe for sv48 support: build a temporary 1:1 mapping of this function
+ * in early_pg_dir, write SATP with the current satp_mode and read it back.
+ * If the value read back differs from the one written, the mode was not
+ * taken into account and we fall back to 3-level via disable_pgtable_l4().
+ */
+static __init void set_satp_mode(void)
+{
+	u64 identity_satp, hw_satp;
+	uintptr_t set_satp_mode_pmd;
+
+	set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
+	create_pgd_mapping(early_pg_dir,
+			   set_satp_mode_pmd, (uintptr_t)early_pud,
+			   PGDIR_SIZE, PAGE_TABLE);
+	create_pud_mapping(early_pud,
+			   set_satp_mode_pmd, (uintptr_t)early_pmd,
+			   PUD_SIZE, PAGE_TABLE);
+	/* Map two consecutive PMDs in case set_satp_mode straddles a PMD boundary */
+	create_pmd_mapping(early_pmd,
+			   set_satp_mode_pmd, set_satp_mode_pmd,
+			   PMD_SIZE, PAGE_KERNEL_EXEC);
+	create_pmd_mapping(early_pmd,
+			   set_satp_mode_pmd + PMD_SIZE,
+			   set_satp_mode_pmd + PMD_SIZE,
+			   PMD_SIZE, PAGE_KERNEL_EXEC);
+
+	identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
+
+	local_flush_tlb_all();
+	csr_write(CSR_SATP, identity_satp);
+	hw_satp = csr_swap(CSR_SATP, 0ULL);	/* read SATP back, writing 0 in its place */
+	local_flush_tlb_all();
+
+	if (hw_satp != identity_satp)	/* the write did not stick */
+		disable_pgtable_l4();
+
+	memset(early_pg_dir, 0, PAGE_SIZE);	/* wipe the temporary probe mappings */
+	memset(early_pud, 0, PAGE_SIZE);
+	memset(early_pmd, 0, PAGE_SIZE);
+}
+#endif
+
 /*
  * setup_vm() is called from head.S with MMU-off.
  *
@@ -557,10 +697,15 @@  static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa)
 	uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1);
 
 	create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA,
-			   IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa,
+			   IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa,
 			   PGDIR_SIZE,
 			   IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL);
 
+	if (pgtable_l4_enabled) {
+		create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA,
+				   (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE);
+	}
+
 	if (IS_ENABLED(CONFIG_64BIT)) {
 		create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
 				   pa, PMD_SIZE, PAGE_KERNEL);
@@ -593,6 +738,8 @@  void pt_ops_set_early(void)
 #ifndef __PAGETABLE_PMD_FOLDED
 	pt_ops.alloc_pmd = alloc_pmd_early;
 	pt_ops.get_pmd_virt = get_pmd_virt_early;
+	pt_ops.alloc_pud = alloc_pud_early;
+	pt_ops.get_pud_virt = get_pud_virt_early;
 #endif
 }
 
@@ -611,6 +758,8 @@  void pt_ops_set_fixmap(void)
 #ifndef __PAGETABLE_PMD_FOLDED
 	pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap);
 	pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap);
+	pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap);
+	pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap);
 #endif
 }
 
@@ -625,6 +774,8 @@  void pt_ops_set_late(void)
 #ifndef __PAGETABLE_PMD_FOLDED
 	pt_ops.alloc_pmd = alloc_pmd_late;
 	pt_ops.get_pmd_virt = get_pmd_virt_late;
+	pt_ops.alloc_pud = alloc_pud_late;
+	pt_ops.get_pud_virt = get_pud_virt_late;
 #endif
 }
 
@@ -633,6 +784,7 @@  asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 	pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd;
 
 	kernel_map.virt_addr = KERNEL_LINK_ADDR;
+	kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
 
 #ifdef CONFIG_XIP_KERNEL
 	kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
@@ -647,6 +799,11 @@  asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 	kernel_map.phys_addr = (uintptr_t)(&_start);
 	kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
 #endif
+
+#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
+	set_satp_mode();
+#endif
+
 	kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
 	kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
 
@@ -676,15 +833,21 @@  asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 
 	/* Setup early PGD for fixmap */
 	create_pgd_mapping(early_pg_dir, FIXADDR_START,
-			   (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
+			   fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
 
 #ifndef __PAGETABLE_PMD_FOLDED
-	/* Setup fixmap PMD */
+	/* Setup fixmap PUD and PMD */
+	if (pgtable_l4_enabled)
+		create_pud_mapping(fixmap_pud, FIXADDR_START,
+				   (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
 	create_pmd_mapping(fixmap_pmd, FIXADDR_START,
 			   (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
 	/* Setup trampoline PGD and PMD */
 	create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
-			   (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
+			   trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
+	if (pgtable_l4_enabled)
+		create_pud_mapping(trampoline_pud, kernel_map.virt_addr,
+				   (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
 #ifdef CONFIG_XIP_KERNEL
 	create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
 			   kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
@@ -712,7 +875,7 @@  asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 	 * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap
 	 * range can not span multiple pmds.
 	 */
-	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+	BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
 		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -783,9 +946,10 @@  static void __init setup_vm_final(void)
 	/* Clear fixmap PTE and PMD mappings */
 	clear_fixmap(FIX_PTE);
 	clear_fixmap(FIX_PMD);
+	clear_fixmap(FIX_PUD);
 
 	/* Move to swapper page table */
-	csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
+	csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
 	local_flush_tlb_all();
 
 	pt_ops_set_late();
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 1434a0225140..993f50571a3b 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -11,7 +11,29 @@ 
 #include <asm/fixmap.h>
 #include <asm/pgalloc.h>
 
+/*
+ * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57
+ * which is right before the kernel.
+ *
+ * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate
+ * the page global directory with kasan_early_shadow_pmd.
+ *
+ * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping
+ * must be divided as follows:
+ * - the first PGD entry, although incomplete, is populated with
+ *   kasan_early_shadow_pud/p4d
+ * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d
+ * - the last PGD entry is shared with the kernel mapping so populated at the
+ *   lower levels pud/p4d
+ *
+ * In addition, when shallow populating a kasan region (for example vmalloc),
+ * this region may also not be aligned on PGDIR_SIZE, so we must go down to the
+ * pud level too.
+ */
+
 extern pgd_t early_pg_dir[PTRS_PER_PGD];
+extern struct pt_alloc_ops _pt_ops __initdata;
+#define pt_ops	_pt_ops
 
 static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
 {
@@ -35,15 +57,19 @@  static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned
 	set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
 }
 
-static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end)
 {
 	phys_addr_t phys_addr;
 	pmd_t *pmdp, *base_pmd;
 	unsigned long next;
 
-	base_pmd = (pmd_t *)pgd_page_vaddr(*pgd);
-	if (base_pmd == lm_alias(kasan_early_shadow_pmd))
+	if (pud_none(*pud)) {
 		base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
+	} else {
+		base_pmd = (pmd_t *)pud_pgtable(*pud);
+		if (base_pmd == lm_alias(kasan_early_shadow_pmd))
+			base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
+	}
 
 	pmdp = base_pmd + pmd_index(vaddr);
 
@@ -67,9 +93,72 @@  static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned
 	 * it entirely, memblock could allocate a page at a physical address
 	 * where KASAN is not populated yet and then we'd get a page fault.
 	 */
-	set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
+	set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
+}
+
+static void __init kasan_populate_pud(pgd_t *pgd,
+				      unsigned long vaddr, unsigned long end,
+				      bool early)
+{
+	phys_addr_t phys_addr;
+	pud_t *pudp, *base_pud;
+	unsigned long next;
+
+	if (early) {
+		/*
+		 * We can't use pgd_page_vaddr here: it would return a linear
+		 * mapping address, and the linear mapping is not set up yet.
+		 * When populating early_pg_dir we need the physical address,
+		 * and when populating swapper_pg_dir we need the kernel
+		 * virtual address, so go through the pt_ops facility.
+		 */
+		base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd)));
+	} else {
+		base_pud = (pud_t *)pgd_page_vaddr(*pgd);
+		if (base_pud == lm_alias(kasan_early_shadow_pud))
+			base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE);
+	}
+
+	pudp = base_pud + pud_index(vaddr);
+
+	do {
+		next = pud_addr_end(vaddr, end);
+
+		if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) {
+			if (early) {	/* early: point at the shared kasan_early_shadow_pmd */
+				phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd));
+				set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE));
+				continue;
+			} else {	/* late: try to back the whole entry with a PUD_SIZE block */
+				phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE);
+				if (phys_addr) {
+					set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL));
+					continue;
+				}
+			}
+		}
+
+		kasan_populate_pmd(pudp, vaddr, next);	/* otherwise populate at pmd level */
+	} while (pudp++, vaddr = next, vaddr != end);
+
+	/*
+	 * Wait for the whole PUD table to be populated before installing it in
+	 * the PGD: if the PGD entry were set before the table is complete,
+	 * memblock could allocate a page at a physical address where KASAN is
+	 * not populated yet and then we'd get a page fault.
+	 */
+	if (!early)
+		set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE));
 }
 
+#define kasan_early_shadow_pgd_next			(pgtable_l4_enabled ?	\
+				(uintptr_t)kasan_early_shadow_pud :		\
+				(uintptr_t)kasan_early_shadow_pmd)
+#define kasan_populate_pgd_next(pgdp, vaddr, next, early)			\
+		(pgtable_l4_enabled ?						\
+			kasan_populate_pud(pgdp, vaddr, next, early) :		\
+			kasan_populate_pmd((pud_t *)pgdp, vaddr, next))
+
 static void __init kasan_populate_pgd(pgd_t *pgdp,
 				      unsigned long vaddr, unsigned long end,
 				      bool early)
@@ -102,7 +191,7 @@  static void __init kasan_populate_pgd(pgd_t *pgdp,
 			}
 		}
 
-		kasan_populate_pmd(pgdp, vaddr, next);
+		kasan_populate_pgd_next(pgdp, vaddr, next, early);
 	} while (pgdp++, vaddr = next, vaddr != end);
 }
 
@@ -157,18 +246,54 @@  static void __init kasan_populate(void *start, void *end)
 	memset(start, KASAN_SHADOW_INIT, end - start);
 }
 
+static void __init kasan_shallow_populate_pud(pgd_t *pgdp,
+					      unsigned long vaddr, unsigned long end,
+					      bool kasan_populate)
+{
+	unsigned long next;
+	pud_t *pudp, *base_pud;
+	pmd_t *base_pmd;
+	bool is_kasan_pmd;
+
+	base_pud = (pud_t *)pgd_page_vaddr(*pgdp);
+	pudp = base_pud + pud_index(vaddr);
+
+	if (kasan_populate)	/* seed the table with the early shadow entries */
+		memcpy(base_pud, (void *)kasan_early_shadow_pgd_next,
+		       sizeof(pud_t) * PTRS_PER_PUD);
+
+	do {
+		next = pud_addr_end(vaddr, end);
+		is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd));
+
+		if (is_kasan_pmd) {	/* still the shared early pmd: give it its own page */
+			base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+			set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
+		}
+	} while (pudp++, vaddr = next, vaddr != end);
+}
+
 static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end)
 {
 	unsigned long next;
 	void *p;
 	pgd_t *pgd_k = pgd_offset_k(vaddr);
+	bool is_kasan_pgd_next;
 
 	do {
 		next = pgd_addr_end(vaddr, end);
-		if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) {
+		is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) ==
+				     (unsigned long)lm_alias(kasan_early_shadow_pgd_next));
+
+		if (is_kasan_pgd_next) {
 			p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
 			set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
 		}
+
+		if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE)
+			continue;
+
+		kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next);
 	} while (pgd_k++, vaddr = next, vaddr != end);
 }
 
diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
index 26e69788f27a..b3db5d91ed38 100644
--- a/drivers/firmware/efi/libstub/efi-stub.c
+++ b/drivers/firmware/efi/libstub/efi-stub.c
@@ -40,6 +40,8 @@ 
 
 #ifdef CONFIG_ARM64
 # define EFI_RT_VIRTUAL_LIMIT	DEFAULT_MAP_WINDOW_64
+#elif defined(CONFIG_RISCV)
+# define EFI_RT_VIRTUAL_LIMIT	TASK_SIZE_MIN
 #else
 # define EFI_RT_VIRTUAL_LIMIT	TASK_SIZE
 #endif