Message ID | 20211206104657.433304-8-alexandre.ghiti@canonical.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Introduce sv48 support without relocatable kernel |
On 12/6/21 11:46, Alexandre Ghiti wrote: > By adding a new 4th level of page table, give the possibility to 64bit > kernel to address 2^48 bytes of virtual address: in practice, that offers > 128TB of virtual address space to userspace and allows up to 64TB of > physical memory. > > If the underlying hardware does not support sv48, we will automatically > fallback to a standard 3-level page table by folding the new PUD level into > PGDIR level. In order to detect HW capabilities at runtime, we > use SATP feature that ignores writes with an unsupported mode. > > Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com> > --- > arch/riscv/Kconfig | 4 +- > arch/riscv/include/asm/csr.h | 3 +- > arch/riscv/include/asm/fixmap.h | 1 + > arch/riscv/include/asm/kasan.h | 6 +- > arch/riscv/include/asm/page.h | 14 ++ > arch/riscv/include/asm/pgalloc.h | 40 +++++ > arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++- > arch/riscv/include/asm/pgtable.h | 24 ++- > arch/riscv/kernel/head.S | 3 +- > arch/riscv/mm/context.c | 4 +- > arch/riscv/mm/init.c | 212 +++++++++++++++++++++--- > arch/riscv/mm/kasan_init.c | 137 ++++++++++++++- > drivers/firmware/efi/libstub/efi-stub.c | 2 + > 13 files changed, 514 insertions(+), 44 deletions(-) > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index ac6c0cd9bc29..d28fe0148e13 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -150,7 +150,7 @@ config PAGE_OFFSET > hex > default 0xC0000000 if 32BIT > default 0x80000000 if 64BIT && !MMU > - default 0xffffffd800000000 if 64BIT > + default 0xffffaf8000000000 if 64BIT > > config KASAN_SHADOW_OFFSET > hex > @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM > > config PGTABLE_LEVELS > int > - default 3 if 64BIT > + default 4 if 64BIT > default 2 > > config LOCKDEP_SUPPORT > diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h > index 87ac65696871..3fdb971c7896 100644 > --- a/arch/riscv/include/asm/csr.h > +++ b/arch/riscv/include/asm/csr.h > @@ -40,14 
+40,13 @@ > #ifndef CONFIG_64BIT > #define SATP_PPN _AC(0x003FFFFF, UL) > #define SATP_MODE_32 _AC(0x80000000, UL) > -#define SATP_MODE SATP_MODE_32 > #define SATP_ASID_BITS 9 > #define SATP_ASID_SHIFT 22 > #define SATP_ASID_MASK _AC(0x1FF, UL) > #else > #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) > #define SATP_MODE_39 _AC(0x8000000000000000, UL) > -#define SATP_MODE SATP_MODE_39 > +#define SATP_MODE_48 _AC(0x9000000000000000, UL) > #define SATP_ASID_BITS 16 > #define SATP_ASID_SHIFT 44 > #define SATP_ASID_MASK _AC(0xFFFF, UL) > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h > index 54cbf07fb4e9..58a718573ad6 100644 > --- a/arch/riscv/include/asm/fixmap.h > +++ b/arch/riscv/include/asm/fixmap.h > @@ -24,6 +24,7 @@ enum fixed_addresses { > FIX_HOLE, > FIX_PTE, > FIX_PMD, > + FIX_PUD, > FIX_TEXT_POKE1, > FIX_TEXT_POKE0, > FIX_EARLYCON_MEM_BASE, > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h > index 743e6ff57996..0b85e363e778 100644 > --- a/arch/riscv/include/asm/kasan.h > +++ b/arch/riscv/include/asm/kasan.h > @@ -28,7 +28,11 @@ > #define KASAN_SHADOW_SCALE_SHIFT 3 > > #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) > -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) > +/* > + * Depending on the size of the virtual address space, the region may not be > + * aligned on PGDIR_SIZE, so force its alignment to ease its population. > + */ > +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) > #define KASAN_SHADOW_END MODULES_LOWEST_VADDR > #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index e03559f9b35e..d089fe46f7d8 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -31,7 +31,20 @@ > * When not using MMU this corresponds to the first free page in > * physical memory (aligned on a page boundary). 
> */ > +#ifdef CONFIG_64BIT > +#ifdef CONFIG_MMU > +#define PAGE_OFFSET kernel_map.page_offset > +#else > +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > +#endif > +/* > + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so > + * define the PAGE_OFFSET value for SV39. > + */ > +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) > +#else > #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > +#endif /* CONFIG_64BIT */ > > /* > * Half of the kernel address space (half of the entries of the page global > @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base; > #endif /* CONFIG_MMU */ > > struct kernel_mapping { > + unsigned long page_offset; > unsigned long virt_addr; > uintptr_t phys_addr; > uintptr_t size; > diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h > index 0af6933a7100..11823004b87a 100644 > --- a/arch/riscv/include/asm/pgalloc.h > +++ b/arch/riscv/include/asm/pgalloc.h > @@ -11,6 +11,8 @@ > #include <asm/tlb.h> > > #ifdef CONFIG_MMU > +#define __HAVE_ARCH_PUD_ALLOC_ONE > +#define __HAVE_ARCH_PUD_FREE > #include <asm-generic/pgalloc.h> > > static inline void pmd_populate_kernel(struct mm_struct *mm, > @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) > > set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > } > + > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) > +{ > + if (pgtable_l4_enabled) { > + unsigned long pfn = virt_to_pfn(pud); > + > + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > + } > +} > + > +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, > + pud_t *pud) > +{ > + if (pgtable_l4_enabled) { > + unsigned long pfn = virt_to_pfn(pud); > + > + set_p4d_safe(p4d, > + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > + } > +} > + > +#define pud_alloc_one pud_alloc_one > +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) > +{ > + if 
(pgtable_l4_enabled) > + return __pud_alloc_one(mm, addr); > + > + return NULL; > +} > + > +#define pud_free pud_free > +static inline void pud_free(struct mm_struct *mm, pud_t *pud) > +{ > + if (pgtable_l4_enabled) > + __pud_free(mm, pud); > +} > + > +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) > #endif /* __PAGETABLE_PMD_FOLDED */ > > static inline pgd_t *pgd_alloc(struct mm_struct *mm) > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h > index 228261aa9628..bbbdd66e5e2f 100644 > --- a/arch/riscv/include/asm/pgtable-64.h > +++ b/arch/riscv/include/asm/pgtable-64.h > @@ -8,16 +8,36 @@ > > #include <linux/const.h> > > -#define PGDIR_SHIFT 30 > +extern bool pgtable_l4_enabled; > + > +#define PGDIR_SHIFT_L3 30 > +#define PGDIR_SHIFT_L4 39 > +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) > + > +#define PGDIR_SHIFT (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) > /* Size of region mapped by a page global directory */ > #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) > #define PGDIR_MASK (~(PGDIR_SIZE - 1)) > > +/* pud is folded into pgd in case of 3-level page table */ > +#define PUD_SHIFT 30 > +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) > +#define PUD_MASK (~(PUD_SIZE - 1)) > + > #define PMD_SHIFT 21 > /* Size of region mapped by a page middle directory */ > #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) > #define PMD_MASK (~(PMD_SIZE - 1)) > > +/* Page Upper Directory entry */ > +typedef struct { > + unsigned long pud; > +} pud_t; > + > +#define pud_val(x) ((x).pud) > +#define __pud(x) ((pud_t) { (x) }) > +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) > + > /* Page Middle Directory entry */ > typedef struct { > unsigned long pmd; > @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) > set_pud(pudp, __pud(0)); > } > > +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) > +{ > + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > +} > + > +static inline unsigned long 
_pud_pfn(pud_t pud) > +{ > + return pud_val(pud) >> _PAGE_PFN_SHIFT; > +} > + > static inline pmd_t *pud_pgtable(pud_t pud) > { > return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); > @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) > return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); > } > > +#define mm_pud_folded mm_pud_folded > +static inline bool mm_pud_folded(struct mm_struct *mm) > +{ > + if (pgtable_l4_enabled) > + return false; > + > + return true; > +} > + > +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) > + > static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) > { > return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) > #define pmd_ERROR(e) \ > pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) > > +#define pud_ERROR(e) \ > + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) > + > +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + *p4dp = p4d; > + else > + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); > +} > + > +static inline int p4d_none(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (p4d_val(p4d) == 0); > + > + return 0; > +} > + > +static inline int p4d_present(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (p4d_val(p4d) & _PAGE_PRESENT); > + > + return 1; > +} > + > +static inline int p4d_bad(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return !p4d_present(p4d); > + > + return 0; > +} > + > +static inline void p4d_clear(p4d_t *p4d) > +{ > + if (pgtable_l4_enabled) > + set_p4d(p4d, __p4d(0)); > +} > + > +static inline pud_t *p4d_pgtable(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > + > + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); > +} > + > +static inline struct page *p4d_page(p4d_t p4d) > +{ > + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); 
> +} > + > +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) > + > +#define pud_offset pud_offset > +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) > +{ > + if (pgtable_l4_enabled) > + return p4d_pgtable(*p4d) + pud_index(address); > + > + return (pud_t *)p4d; > +} > + > #endif /* _ASM_RISCV_PGTABLE_64_H */ > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > index e1a52e22ad7e..e1c74ef4ead2 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -51,7 +51,7 @@ > * position vmemmap directly below the VMALLOC region. > */ > #ifdef CONFIG_64BIT > -#define VA_BITS 39 > +#define VA_BITS (pgtable_l4_enabled ? 48 : 39) > #else > #define VA_BITS 32 > #endif > @@ -90,8 +90,7 @@ > > #ifndef __ASSEMBLY__ > > -/* Page Upper Directory not used in RISC-V */ > -#include <asm-generic/pgtable-nopud.h> > +#include <asm-generic/pgtable-nop4d.h> > #include <asm/page.h> > #include <asm/tlbflush.h> > #include <linux/mm_types.h> > @@ -113,6 +112,17 @@ > #define XIP_FIXUP(addr) (addr) > #endif /* CONFIG_XIP_KERNEL */ > > +struct pt_alloc_ops { > + pte_t *(*get_pte_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pte)(uintptr_t va); > +#ifndef __PAGETABLE_PMD_FOLDED > + pmd_t *(*get_pmd_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pmd)(uintptr_t va); > + pud_t *(*get_pud_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pud)(uintptr_t va); > +#endif > +}; > + > #ifdef CONFIG_MMU > /* Number of entries in the page global directory */ > #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) > @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, > * Note that PGDIR_SIZE must evenly divide TASK_SIZE. 
> */ > #ifdef CONFIG_64BIT > -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) > #else > -#define TASK_SIZE FIXADDR_START > +#define TASK_SIZE FIXADDR_START > +#define TASK_SIZE_MIN TASK_SIZE > #endif > > #else /* CONFIG_MMU */ > @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa; > #define dtb_early_va _dtb_early_va > #define dtb_early_pa _dtb_early_pa > #endif /* CONFIG_XIP_KERNEL */ > +extern u64 satp_mode; > +extern bool pgtable_l4_enabled; > > void paging_init(void); > void misc_mem_init(void); > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > index 52c5ff9804c5..c3c0ed559770 100644 > --- a/arch/riscv/kernel/head.S > +++ b/arch/riscv/kernel/head.S > @@ -95,7 +95,8 @@ relocate: > > /* Compute satp for kernel page tables, but don't load it yet */ > srl a2, a0, PAGE_SHIFT > - li a1, SATP_MODE > + la a1, satp_mode > + REG_L a1, 0(a1) > or a2, a2, a1 > > /* > diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c > index ee3459cb6750..a7246872bd30 100644 > --- a/arch/riscv/mm/context.c > +++ b/arch/riscv/mm/context.c > @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > switch_mm_fast: > csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | > ((cntx & asid_mask) << SATP_ASID_SHIFT) | > - SATP_MODE); > + satp_mode); > > if (need_flush_tlb) > local_flush_tlb_all(); > @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > static void set_mm_noasid(struct mm_struct *mm) > { > /* Switch the page table and blindly nuke entire local TLB */ > - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); > + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); > local_flush_tlb_all(); > } > > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index 1552226fb6bd..6a19a1b1caf8 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -37,6 +37,17 @@ 
EXPORT_SYMBOL(kernel_map); > #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) > #endif > > +#ifdef CONFIG_64BIT > +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; > +#else > +u64 satp_mode = SATP_MODE_32; > +#endif > +EXPORT_SYMBOL(satp_mode); > + > +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? > + true : false; > +EXPORT_SYMBOL(pgtable_l4_enabled); > + > phys_addr_t phys_ram_base __ro_after_init; > EXPORT_SYMBOL(phys_ram_base); > > @@ -53,15 +64,6 @@ extern char _start[]; > void *_dtb_early_va __initdata; > uintptr_t _dtb_early_pa __initdata; > > -struct pt_alloc_ops { > - pte_t *(*get_pte_virt)(phys_addr_t pa); > - phys_addr_t (*alloc_pte)(uintptr_t va); > -#ifndef __PAGETABLE_PMD_FOLDED > - pmd_t *(*get_pmd_virt)(phys_addr_t pa); > - phys_addr_t (*alloc_pmd)(uintptr_t va); > -#endif > -}; > - > static phys_addr_t dma32_phys_limit __initdata; > > static void __init zone_sizes_init(void) > @@ -222,7 +224,7 @@ static void __init setup_bootmem(void) > } > > #ifdef CONFIG_MMU > -static struct pt_alloc_ops _pt_ops __initdata; > +struct pt_alloc_ops _pt_ops __initdata; > > #ifdef CONFIG_XIP_KERNEL > #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) > @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; > static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; > > pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); > +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); > static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); > > #ifdef CONFIG_XIP_KERNEL > @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); > #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) > #endif /* CONFIG_XIP_KERNEL */ > > +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; > +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; > 
+static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); > + > +#ifdef CONFIG_XIP_KERNEL > +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) > +#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud)) > +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) > +#endif /* CONFIG_XIP_KERNEL */ > + > static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) > { > /* Before MMU is enabled */ > @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) > > static phys_addr_t __init alloc_pmd_early(uintptr_t va) > { > - BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); > + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); > > return (uintptr_t)early_pmd; > } > @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, > create_pte_mapping(ptep, va, pa, sz, prot); > } > > -#define pgd_next_t pmd_t > -#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) > -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) > +static pud_t *__init get_pud_virt_early(phys_addr_t pa) > +{ > + return (pud_t *)((uintptr_t)pa); > +} > + > +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) > +{ > + clear_fixmap(FIX_PUD); > + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); > +} > + > +static pud_t *__init get_pud_virt_late(phys_addr_t pa) > +{ > + return (pud_t *)__va(pa); > +} > + > +static phys_addr_t __init alloc_pud_early(uintptr_t va) > +{ > + /* Only one PUD is available for early mapping */ > + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); > + > + return (uintptr_t)early_pud; > +} > + > +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) > +{ > + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); > +} > + > +static phys_addr_t alloc_pud_late(uintptr_t va) > +{ > + unsigned long vaddr; > + > + vaddr = __get_free_page(GFP_KERNEL); > + BUG_ON(!vaddr); > + return __pa(vaddr); > +} > + > +static void __init create_pud_mapping(pud_t *pudp, > + uintptr_t va, phys_addr_t pa, > + phys_addr_t sz, pgprot_t prot) > +{ > + pmd_t 
*nextp; > + phys_addr_t next_phys; > + uintptr_t pud_index = pud_index(va); > + > + if (sz == PUD_SIZE) { > + if (pud_val(pudp[pud_index]) == 0) > + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); > + return; > + } > + > + if (pud_val(pudp[pud_index]) == 0) { > + next_phys = pt_ops.alloc_pmd(va); > + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); > + nextp = pt_ops.get_pmd_virt(next_phys); > + memset(nextp, 0, PAGE_SIZE); > + } else { > + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); > + nextp = pt_ops.get_pmd_virt(next_phys); > + } > + > + create_pmd_mapping(nextp, va, pa, sz, prot); > +} > + > +#define pgd_next_t pud_t > +#define alloc_pgd_next(__va) (pgtable_l4_enabled ? \ > + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) > +#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \ > + pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ > - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) > -#define fixmap_pgd_next fixmap_pmd > + (pgtable_l4_enabled ? \ > + create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \ > + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) > +#define fixmap_pgd_next (pgtable_l4_enabled ? \ > + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) > +#define trampoline_pgd_next (pgtable_l4_enabled ? \ > + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) > +#define early_dtb_pgd_next (pgtable_l4_enabled ? 
\ > + (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) > #else > #define pgd_next_t pte_t > #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) > #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ > create_pte_mapping(__nextp, __va, __pa, __sz, __prot) > -#define fixmap_pgd_next fixmap_pte > +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) > +#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) > +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) > #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) > -#endif > +#endif /* __PAGETABLE_PMD_FOLDED */ > > void __init create_pgd_mapping(pgd_t *pgdp, > uintptr_t va, phys_addr_t pa, > @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) > } > #endif /* CONFIG_STRICT_KERNEL_RWX */ > > +#ifdef CONFIG_64BIT > +static void __init disable_pgtable_l4(void) > +{ > + pgtable_l4_enabled = false; > + kernel_map.page_offset = PAGE_OFFSET_L3; > + satp_mode = SATP_MODE_39; > +} > + > +/* > + * There is a simple way to determine if 4-level is supported by the > + * underlying hardware: establish 1:1 mapping in 4-level page table mode > + * then read SATP to see if the configuration was taken into account > + * meaning sv48 is supported. 
> + */ > +static __init void set_satp_mode(void) > +{ > + u64 identity_satp, hw_satp; > + uintptr_t set_satp_mode_pmd; > + > + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; > + create_pgd_mapping(early_pg_dir, > + set_satp_mode_pmd, (uintptr_t)early_pud, > + PGDIR_SIZE, PAGE_TABLE); > + create_pud_mapping(early_pud, > + set_satp_mode_pmd, (uintptr_t)early_pmd, > + PUD_SIZE, PAGE_TABLE); > + /* Handle the case where set_satp_mode straddles 2 PMDs */ > + create_pmd_mapping(early_pmd, > + set_satp_mode_pmd, set_satp_mode_pmd, > + PMD_SIZE, PAGE_KERNEL_EXEC); > + create_pmd_mapping(early_pmd, > + set_satp_mode_pmd + PMD_SIZE, > + set_satp_mode_pmd + PMD_SIZE, > + PMD_SIZE, PAGE_KERNEL_EXEC); > + > + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; > + > + local_flush_tlb_all(); > + csr_write(CSR_SATP, identity_satp); > + hw_satp = csr_swap(CSR_SATP, 0ULL); > + local_flush_tlb_all(); > + > + if (hw_satp != identity_satp) > + disable_pgtable_l4(); > + > + memset(early_pg_dir, 0, PAGE_SIZE); > + memset(early_pud, 0, PAGE_SIZE); > + memset(early_pmd, 0, PAGE_SIZE); > +} > +#endif > + > /* > * setup_vm() is called from head.S with MMU-off. > * > @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) > uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); > > create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, > - IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, > + IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, > PGDIR_SIZE, > IS_ENABLED(CONFIG_64BIT) ? 
PAGE_TABLE : PAGE_KERNEL); > > + if (pgtable_l4_enabled) { > + create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, > + (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); > + } > + > if (IS_ENABLED(CONFIG_64BIT)) { > create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, > pa, PMD_SIZE, PAGE_KERNEL); > @@ -593,6 +738,8 @@ void pt_ops_set_early(void) > #ifndef __PAGETABLE_PMD_FOLDED > pt_ops.alloc_pmd = alloc_pmd_early; > pt_ops.get_pmd_virt = get_pmd_virt_early; > + pt_ops.alloc_pud = alloc_pud_early; > + pt_ops.get_pud_virt = get_pud_virt_early; > #endif > } > > @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void) > #ifndef __PAGETABLE_PMD_FOLDED > pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); > pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); > + pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); > + pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); > #endif > } > > @@ -625,6 +774,8 @@ void pt_ops_set_late(void) > #ifndef __PAGETABLE_PMD_FOLDED > pt_ops.alloc_pmd = alloc_pmd_late; > pt_ops.get_pmd_virt = get_pmd_virt_late; > + pt_ops.alloc_pud = alloc_pud_late; > + pt_ops.get_pud_virt = get_pud_virt_late; > #endif > } > > @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; > > kernel_map.virt_addr = KERNEL_LINK_ADDR; > + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); > > #ifdef CONFIG_XIP_KERNEL > kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; > @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > kernel_map.phys_addr = (uintptr_t)(&_start); > kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; > #endif > + > +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) > + set_satp_mode(); > +#endif > + > kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; > kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; > > @@ 
-676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > /* Setup early PGD for fixmap */ > create_pgd_mapping(early_pg_dir, FIXADDR_START, > - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); > + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); > > #ifndef __PAGETABLE_PMD_FOLDED > - /* Setup fixmap PMD */ > + /* Setup fixmap PUD and PMD */ > + if (pgtable_l4_enabled) > + create_pud_mapping(fixmap_pud, FIXADDR_START, > + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); > create_pmd_mapping(fixmap_pmd, FIXADDR_START, > (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); > /* Setup trampoline PGD and PMD */ > create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, > - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); > + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); > + if (pgtable_l4_enabled) > + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, > + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); > #ifdef CONFIG_XIP_KERNEL > create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, > kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); > @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap > * range can not span multiple pmds. 
> */ > - BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) > + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) > != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); > > #ifndef __PAGETABLE_PMD_FOLDED > @@ -783,9 +946,10 @@ static void __init setup_vm_final(void) > /* Clear fixmap PTE and PMD mappings */ > clear_fixmap(FIX_PTE); > clear_fixmap(FIX_PMD); > + clear_fixmap(FIX_PUD); > > /* Move to swapper page table */ > - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); > + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); > local_flush_tlb_all(); > > pt_ops_set_late(); > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c > index 1434a0225140..993f50571a3b 100644 > --- a/arch/riscv/mm/kasan_init.c > +++ b/arch/riscv/mm/kasan_init.c > @@ -11,7 +11,29 @@ > #include <asm/fixmap.h> > #include <asm/pgalloc.h> > > +/* > + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 > + * which is right before the kernel. > + * > + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate > + * the page global directory with kasan_early_shadow_pmd. > + * > + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping > + * must be divided as follows: > + * - the first PGD entry, although incomplete, is populated with > + * kasan_early_shadow_pud/p4d > + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d > + * - the last PGD entry is shared with the kernel mapping so populated at the > + * lower levels pud/p4d > + * > + * In addition, when shallow populating a kasan region (for example vmalloc), > + * this region may also not be aligned on PGDIR size, so we must go down to the > + * pud level too. 
> + */ > + > extern pgd_t early_pg_dir[PTRS_PER_PGD]; > +extern struct pt_alloc_ops _pt_ops __initdata; > +#define pt_ops _pt_ops > > static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) > { > @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned > set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); > } > > -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) > +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) > { > phys_addr_t phys_addr; > pmd_t *pmdp, *base_pmd; > unsigned long next; > > - base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); > - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) > + if (pud_none(*pud)) { > base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); > + } else { > + base_pmd = (pmd_t *)pud_pgtable(*pud); > + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) > + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); > + } > > pmdp = base_pmd + pmd_index(vaddr); > > @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned > * it entirely, memblock could allocate a page at a physical address > * where KASAN is not populated yet and then we'd get a page fault. > */ > - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > + set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > +} > + > +static void __init kasan_populate_pud(pgd_t *pgd, > + unsigned long vaddr, unsigned long end, > + bool early) > +{ > + phys_addr_t phys_addr; > + pud_t *pudp, *base_pud; > + unsigned long next; > + > + if (early) { > + /* > + * We can't use pgd_page_vaddr here as it would return a linear > + * mapping address but it is not mapped yet, but when populating > + * early_pg_dir, we need the physical address and when populating > + * swapper_pg_dir, we need the kernel virtual address so use > + * pt_ops facility. 
> + */ > + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); > + } else { > + base_pud = (pud_t *)pgd_page_vaddr(*pgd); > + if (base_pud == lm_alias(kasan_early_shadow_pud)) > + base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); > + } > + > + pudp = base_pud + pud_index(vaddr); > + > + do { > + next = pud_addr_end(vaddr, end); > + > + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { > + if (early) { > + phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); > + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); > + continue; > + } else { > + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); > + if (phys_addr) { > + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); > + continue; > + } > + } > + } > + > + kasan_populate_pmd(pudp, vaddr, next); > + } while (pudp++, vaddr = next, vaddr != end); > + > + /* > + * Wait for the whole PGD to be populated before setting the PGD in > + * the page table, otherwise, if we did set the PGD before populating > + * it entirely, memblock could allocate a page at a physical address > + * where KASAN is not populated yet and then we'd get a page fault. > + */ > + if (!early) > + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); > } > > +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \ > + (uintptr_t)kasan_early_shadow_pud : \ > + (uintptr_t)kasan_early_shadow_pmd) > +#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ > + (pgtable_l4_enabled ? 
\ > + kasan_populate_pud(pgdp, vaddr, next, early) : \ > + kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) > + > static void __init kasan_populate_pgd(pgd_t *pgdp, > unsigned long vaddr, unsigned long end, > bool early) > @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, > } > } > > - kasan_populate_pmd(pgdp, vaddr, next); > + kasan_populate_pgd_next(pgdp, vaddr, next, early); > } while (pgdp++, vaddr = next, vaddr != end); > } > > @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end) > memset(start, KASAN_SHADOW_INIT, end - start); > } > > +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, > + unsigned long vaddr, unsigned long end, > + bool kasan_populate) > +{ > + unsigned long next; > + pud_t *pudp, *base_pud; > + pmd_t *base_pmd; > + bool is_kasan_pmd; > + > + base_pud = (pud_t *)pgd_page_vaddr(*pgdp); > + pudp = base_pud + pud_index(vaddr); > + > + if (kasan_populate) > + memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, > + sizeof(pud_t) * PTRS_PER_PUD); > + > + do { > + next = pud_addr_end(vaddr, end); > + is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); > + > + if (is_kasan_pmd) { > + base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); > + set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > + } > + } while (pudp++, vaddr = next, vaddr != end); > +} > + > static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) > { > unsigned long next; > void *p; > pgd_t *pgd_k = pgd_offset_k(vaddr); > + bool is_kasan_pgd_next; > > do { > next = pgd_addr_end(vaddr, end); > - if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { > + is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == > + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); > + > + if (is_kasan_pgd_next) { > p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); > set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); > } > + > + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - 
vaddr) >= PGDIR_SIZE) > + continue; > + > + kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); > } while (pgd_k++, vaddr = next, vaddr != end); > } @Qinglin: I can deal with sv57 kasan population if needs be as it is a bit tricky and I think it would save you quite some time :) > > diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c > index 26e69788f27a..b3db5d91ed38 100644 > --- a/drivers/firmware/efi/libstub/efi-stub.c > +++ b/drivers/firmware/efi/libstub/efi-stub.c > @@ -40,6 +40,8 @@ > > #ifdef CONFIG_ARM64 > # define EFI_RT_VIRTUAL_LIMIT DEFAULT_MAP_WINDOW_64 > +#elif defined(CONFIG_RISCV) > +# define EFI_RT_VIRTUAL_LIMIT TASK_SIZE_MIN > #else > # define EFI_RT_VIRTUAL_LIMIT TASK_SIZE > #endif
Hi Alex, On 2021/12/6 19:05, Alexandre ghiti wrote: > On 12/6/21 11:46, Alexandre Ghiti wrote: >> By adding a new 4th level of page table, give the possibility to 64bit >> kernel to address 2^48 bytes of virtual address: in practice, that offers >> 128TB of virtual address space to userspace and allows up to 64TB of >> physical memory. >> >> If the underlying hardware does not support sv48, we will automatically >> fallback to a standard 3-level page table by folding the new PUD level into >> PGDIR level. In order to detect HW capabilities at runtime, we >> use SATP feature that ignores writes with an unsupported mode. >> >> Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com> >> --- >>  arch/riscv/Kconfig                     |  4 +- >>  arch/riscv/include/asm/csr.h           |  3 +- >>  arch/riscv/include/asm/fixmap.h        |  1 + >>  arch/riscv/include/asm/kasan.h         |  6 +- >>  arch/riscv/include/asm/page.h          | 14 ++ >>  arch/riscv/include/asm/pgalloc.h       | 40 +++++ >>  arch/riscv/include/asm/pgtable-64.h    | 108 +++++++++++- >>  arch/riscv/include/asm/pgtable.h       | 24 ++- >>  arch/riscv/kernel/head.S               |  3 +- >>  arch/riscv/mm/context.c                |  4 +- >>  arch/riscv/mm/init.c                   | 212 +++++++++++++++++++++--- >>  arch/riscv/mm/kasan_init.c             | 137 ++++++++++++++- >>  drivers/firmware/efi/libstub/efi-stub.c |  2 + >>  13 files changed, 514 insertions(+), 44 deletions(-) >> >> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >> index ac6c0cd9bc29..d28fe0148e13 100644 >> --- a/arch/riscv/Kconfig >> +++ b/arch/riscv/Kconfig >> @@ -150,7 +150,7 @@ config PAGE_OFFSET >>      hex >>      default 0xC0000000 if 32BIT >>      default 0x80000000 if 64BIT && !MMU >> -   default 0xffffffd800000000 if 64BIT >> +   default 0xffffaf8000000000 if 64BIT >>    config KASAN_SHADOW_OFFSET >>      hex >> @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM >>    config PGTABLE_LEVELS >>      int >> -   default 
3 if 64BIT >> +   default 4 if 64BIT >>      default 2 >>    config LOCKDEP_SUPPORT >> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h >> index 87ac65696871..3fdb971c7896 100644 >> --- a/arch/riscv/include/asm/csr.h >> +++ b/arch/riscv/include/asm/csr.h >> @@ -40,14 +40,13 @@ >>  #ifndef CONFIG_64BIT >>  #define SATP_PPN   _AC(0x003FFFFF, UL) >>  #define SATP_MODE_32   _AC(0x80000000, UL) >> -#define SATP_MODE   SATP_MODE_32 >>  #define SATP_ASID_BITS   9 >>  #define SATP_ASID_SHIFT   22 >>  #define SATP_ASID_MASK   _AC(0x1FF, UL) >>  #else >>  #define SATP_PPN   _AC(0x00000FFFFFFFFFFF, UL) >>  #define SATP_MODE_39   _AC(0x8000000000000000, UL) >> -#define SATP_MODE   SATP_MODE_39 >> +#define SATP_MODE_48   _AC(0x9000000000000000, UL) >>  #define SATP_ASID_BITS   16 >>  #define SATP_ASID_SHIFT   44 >>  #define SATP_ASID_MASK   _AC(0xFFFF, UL) >> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h >> index 54cbf07fb4e9..58a718573ad6 100644 >> --- a/arch/riscv/include/asm/fixmap.h >> +++ b/arch/riscv/include/asm/fixmap.h >> @@ -24,6 +24,7 @@ enum fixed_addresses { >>      FIX_HOLE, >>      FIX_PTE, >>      FIX_PMD, >> +   FIX_PUD, >>      FIX_TEXT_POKE1, >>      FIX_TEXT_POKE0, >>      FIX_EARLYCON_MEM_BASE, >> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h >> index 743e6ff57996..0b85e363e778 100644 >> --- a/arch/riscv/include/asm/kasan.h >> +++ b/arch/riscv/include/asm/kasan.h >> @@ -28,7 +28,11 @@ >>  #define KASAN_SHADOW_SCALE_SHIFT   3 >>    #define KASAN_SHADOW_SIZE   (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) >> -#define KASAN_SHADOW_START   (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) >> +/* >> + * Depending on the size of the virtual address space, the region may not be >> + * aligned on PGDIR_SIZE, so force its alignment to ease its population. 
>> + */ >> +#define KASAN_SHADOW_START   ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) >>  #define KASAN_SHADOW_END   MODULES_LOWEST_VADDR >>  #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) >>  diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h >> index e03559f9b35e..d089fe46f7d8 100644 >> --- a/arch/riscv/include/asm/page.h >> +++ b/arch/riscv/include/asm/page.h >> @@ -31,7 +31,20 @@ >>   * When not using MMU this corresponds to the first free page in >>   * physical memory (aligned on a page boundary). >>   */ >> +#ifdef CONFIG_64BIT >> +#ifdef CONFIG_MMU >> +#define PAGE_OFFSET       kernel_map.page_offset >> +#else >> +#define PAGE_OFFSET       _AC(CONFIG_PAGE_OFFSET, UL) >> +#endif >> +/* >> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so >> + * define the PAGE_OFFSET value for SV39. >> + */ >> +#define PAGE_OFFSET_L3       _AC(0xffffffd800000000, UL) >> +#else >>  #define PAGE_OFFSET       _AC(CONFIG_PAGE_OFFSET, UL) >> +#endif /* CONFIG_64BIT */ >>    /* >>   * Half of the kernel address space (half of the entries of the page global >> @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base; >>  #endif /* CONFIG_MMU */ >>    struct kernel_mapping { >> +   unsigned long page_offset; >>      unsigned long virt_addr; >>      uintptr_t phys_addr; >>      uintptr_t size; >> diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h >> index 0af6933a7100..11823004b87a 100644 >> --- a/arch/riscv/include/asm/pgalloc.h >> +++ b/arch/riscv/include/asm/pgalloc.h >> @@ -11,6 +11,8 @@ >>  #include <asm/tlb.h> >>    #ifdef CONFIG_MMU >> +#define __HAVE_ARCH_PUD_ALLOC_ONE >> +#define __HAVE_ARCH_PUD_FREE >>  #include <asm-generic/pgalloc.h> >>    static inline void pmd_populate_kernel(struct mm_struct *mm, >> @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) >>        set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); 
>>  } >> + >> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) >> +{ >> +   if (pgtable_l4_enabled) { >> +       unsigned long pfn = virt_to_pfn(pud); >> + >> +       set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); >> +   } >> +} >> + >> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, >> +                    pud_t *pud) >> +{ >> +   if (pgtable_l4_enabled) { >> +       unsigned long pfn = virt_to_pfn(pud); >> + >> +       set_p4d_safe(p4d, >> +                __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); >> +   } >> +} >> + >> +#define pud_alloc_one pud_alloc_one >> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) >> +{ >> +   if (pgtable_l4_enabled) >> +       return __pud_alloc_one(mm, addr); >> + >> +   return NULL; >> +} >> + >> +#define pud_free pud_free >> +static inline void pud_free(struct mm_struct *mm, pud_t *pud) >> +{ >> +   if (pgtable_l4_enabled) >> +       __pud_free(mm, pud); >> +} >> + >> +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) >>  #endif /* __PAGETABLE_PMD_FOLDED */ >>    static inline pgd_t *pgd_alloc(struct mm_struct *mm) >> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h >> index 228261aa9628..bbbdd66e5e2f 100644 >> --- a/arch/riscv/include/asm/pgtable-64.h >> +++ b/arch/riscv/include/asm/pgtable-64.h >> @@ -8,16 +8,36 @@ >>    #include <linux/const.h> >>  -#define PGDIR_SHIFT    30 >> +extern bool pgtable_l4_enabled; >> + >> +#define PGDIR_SHIFT_L3 30 >> +#define PGDIR_SHIFT_L4 39 >> +#define PGDIR_SIZE_L3  (_AC(1, UL) << PGDIR_SHIFT_L3) >> + >> +#define PGDIR_SHIFT    (pgtable_l4_enabled ? 
PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) >>  /* Size of region mapped by a page global directory */ >>  #define PGDIR_SIZE     (_AC(1, UL) << PGDIR_SHIFT) >>  #define PGDIR_MASK     (~(PGDIR_SIZE - 1)) >>  +/* pud is folded into pgd in case of 3-level page table */ >> +#define PUD_SHIFT     30 >> +#define PUD_SIZE      (_AC(1, UL) << PUD_SHIFT) >> +#define PUD_MASK      (~(PUD_SIZE - 1)) >> + >>  #define PMD_SHIFT      21 >>  /* Size of region mapped by a page middle directory */ >>  #define PMD_SIZE       (_AC(1, UL) << PMD_SHIFT) >>  #define PMD_MASK       (~(PMD_SIZE - 1)) >>  +/* Page Upper Directory entry */ >> +typedef struct { >> +   unsigned long pud; >> +} pud_t; >> + >> +#define pud_val(x)     ((x).pud) >> +#define __pud(x)       ((pud_t) { (x) }) >> +#define PTRS_PER_PUD   (PAGE_SIZE / sizeof(pud_t)) >> + >>  /* Page Middle Directory entry */ >>  typedef struct { >>      unsigned long pmd; >> @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) >>      set_pud(pudp, __pud(0)); >>  } >>  +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) >> +{ >> +   return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); >> +} >> + >> +static inline unsigned long _pud_pfn(pud_t pud) >> +{ >> +   return pud_val(pud) >> _PAGE_PFN_SHIFT; >> +} >> + >>  static inline pmd_t *pud_pgtable(pud_t pud) >>  { >>      return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); >> @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) >>      return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); >>  } >>  +#define mm_pud_folded mm_pud_folded >> +static inline bool mm_pud_folded(struct mm_struct *mm) >> +{ >> +   if (pgtable_l4_enabled) >> +       return false; >> + >> +   return true; >> +} >> + >> +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) >> + >>  static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) >>  { >>      return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); >> @@ -84,4 +125,69 @@ static inline unsigned 
long _pmd_pfn(pmd_t pmd) >>  #define pmd_ERROR(e) \ >>      pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) >>  +#define pud_ERROR(e)  \ >> +   pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) >> + >> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) >> +{ >> +   if (pgtable_l4_enabled) >> +       *p4dp = p4d; >> +   else >> +       set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); >> +} >> + >> +static inline int p4d_none(p4d_t p4d) >> +{ >> +   if (pgtable_l4_enabled) >> +       return (p4d_val(p4d) == 0); >> + >> +   return 0; >> +} >> + >> +static inline int p4d_present(p4d_t p4d) >> +{ >> +   if (pgtable_l4_enabled) >> +       return (p4d_val(p4d) & _PAGE_PRESENT); >> + >> +   return 1; >> +} >> + >> +static inline int p4d_bad(p4d_t p4d) >> +{ >> +   if (pgtable_l4_enabled) >> +       return !p4d_present(p4d); >> + >> +   return 0; >> +} >> + >> +static inline void p4d_clear(p4d_t *p4d) >> +{ >> +   if (pgtable_l4_enabled) >> +       set_p4d(p4d, __p4d(0)); >> +} >> + >> +static inline pud_t *p4d_pgtable(p4d_t p4d) >> +{ >> +   if (pgtable_l4_enabled) >> +       return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); >> + >> +   return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); >> +} >> + >> +static inline struct page *p4d_page(p4d_t p4d) >> +{ >> +   return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); >> +} >> + >> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) >> + >> +#define pud_offset pud_offset >> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) >> +{ >> +   if (pgtable_l4_enabled) >> +       return p4d_pgtable(*p4d) + pud_index(address); >> + >> +   return (pud_t *)p4d; >> +} >> + >>  #endif /* _ASM_RISCV_PGTABLE_64_H */ >> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h >> index e1a52e22ad7e..e1c74ef4ead2 100644 >> --- a/arch/riscv/include/asm/pgtable.h >> +++ b/arch/riscv/include/asm/pgtable.h >> @@ -51,7 +51,7 @@ >>   * 
position vmemmap directly below the VMALLOC region. >>   */ >>  #ifdef CONFIG_64BIT >> -#define VA_BITS       39 >> +#define VA_BITS       (pgtable_l4_enabled ? 48 : 39) >>  #else >>  #define VA_BITS       32 >>  #endif >> @@ -90,8 +90,7 @@ >>    #ifndef __ASSEMBLY__ >>  -/* Page Upper Directory not used in RISC-V */ >> -#include <asm-generic/pgtable-nopud.h> >> +#include <asm-generic/pgtable-nop4d.h> >>  #include <asm/page.h> >>  #include <asm/tlbflush.h> >>  #include <linux/mm_types.h> >> @@ -113,6 +112,17 @@ >>  #define XIP_FIXUP(addr)       (addr) >>  #endif /* CONFIG_XIP_KERNEL */ >>  +struct pt_alloc_ops { >> +   pte_t *(*get_pte_virt)(phys_addr_t pa); >> +   phys_addr_t (*alloc_pte)(uintptr_t va); >> +#ifndef __PAGETABLE_PMD_FOLDED >> +   pmd_t *(*get_pmd_virt)(phys_addr_t pa); >> +   phys_addr_t (*alloc_pmd)(uintptr_t va); >> +   pud_t *(*get_pud_virt)(phys_addr_t pa); >> +   phys_addr_t (*alloc_pud)(uintptr_t va); >> +#endif >> +}; >> + >>  #ifdef CONFIG_MMU >>  /* Number of entries in the page global directory */ >>  #define PTRS_PER_PGD   (PAGE_SIZE / sizeof(pgd_t)) >> @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, >>   * Note that PGDIR_SIZE must evenly divide TASK_SIZE. 
>>   */ >>  #ifdef CONFIG_64BIT >> -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) >> +#define TASK_SIZE     (PGDIR_SIZE * PTRS_PER_PGD / 2) >> +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) >>  #else >> -#define TASK_SIZE FIXADDR_START >> +#define TASK_SIZE   FIXADDR_START >> +#define TASK_SIZE_MIN   TASK_SIZE >>  #endif >>    #else /* CONFIG_MMU */ >> @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa; >>  #define dtb_early_va   _dtb_early_va >>  #define dtb_early_pa   _dtb_early_pa >>  #endif /* CONFIG_XIP_KERNEL */ >> +extern u64 satp_mode; >> +extern bool pgtable_l4_enabled; >>    void paging_init(void); >>  void misc_mem_init(void); >> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S >> index 52c5ff9804c5..c3c0ed559770 100644 >> --- a/arch/riscv/kernel/head.S >> +++ b/arch/riscv/kernel/head.S >> @@ -95,7 +95,8 @@ relocate: >>        /* Compute satp for kernel page tables, but don't load it yet */ >>      srl a2, a0, PAGE_SHIFT >> -   li a1, SATP_MODE >> +   la a1, satp_mode >> +   REG_L a1, 0(a1) >>      or a2, a2, a1 >>        /* >> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c >> index ee3459cb6750..a7246872bd30 100644 >> --- a/arch/riscv/mm/context.c >> +++ b/arch/riscv/mm/context.c >> @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) >>  switch_mm_fast: >>      csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | >>            ((cntx & asid_mask) << SATP_ASID_SHIFT) | >> -         SATP_MODE); >> +         satp_mode); >>        if (need_flush_tlb) >>          local_flush_tlb_all(); >> @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) >>  static void set_mm_noasid(struct mm_struct *mm) >>  { >>      /* Switch the page table and blindly nuke entire local TLB */ >> -   csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); >> +   csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); >>      local_flush_tlb_all(); >>  } >>  diff --git 
a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >> index 1552226fb6bd..6a19a1b1caf8 100644 >> --- a/arch/riscv/mm/init.c >> +++ b/arch/riscv/mm/init.c >> @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map); >>  #define kernel_map   (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) >>  #endif >>  +#ifdef CONFIG_64BIT >> +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; >> +#else >> +u64 satp_mode = SATP_MODE_32; >> +#endif >> +EXPORT_SYMBOL(satp_mode); >> + >> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? >> +               true : false; >> +EXPORT_SYMBOL(pgtable_l4_enabled); >> + >>  phys_addr_t phys_ram_base __ro_after_init; >>  EXPORT_SYMBOL(phys_ram_base); >>  @@ -53,15 +64,6 @@ extern char _start[]; >>  void *_dtb_early_va __initdata; >>  uintptr_t _dtb_early_pa __initdata; >>  -struct pt_alloc_ops { >> -   pte_t *(*get_pte_virt)(phys_addr_t pa); >> -   phys_addr_t (*alloc_pte)(uintptr_t va); >> -#ifndef __PAGETABLE_PMD_FOLDED >> -   pmd_t *(*get_pmd_virt)(phys_addr_t pa); >> -   phys_addr_t (*alloc_pmd)(uintptr_t va); >> -#endif >> -}; >> - >>  static phys_addr_t dma32_phys_limit __initdata; >>    static void __init zone_sizes_init(void) >> @@ -222,7 +224,7 @@ static void __init setup_bootmem(void) >>  } >>    #ifdef CONFIG_MMU >> -static struct pt_alloc_ops _pt_ops __initdata; >> +struct pt_alloc_ops _pt_ops __initdata; >>    #ifdef CONFIG_XIP_KERNEL >>  #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) >> @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; >>  static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; >>    pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); >> +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); >>  static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); >>    #ifdef CONFIG_XIP_KERNEL >> @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] 
__initdata __aligned(PAGE_SIZE); >>  #define early_pmd     ((pmd_t *)XIP_FIXUP(early_pmd)) >>  #endif /* CONFIG_XIP_KERNEL */ >>  +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; >> +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; >> +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); >> + >> +#ifdef CONFIG_XIP_KERNEL >> +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) >> +#define fixmap_pud    ((pud_t *)XIP_FIXUP(fixmap_pud)) >> +#define early_pud     ((pud_t *)XIP_FIXUP(early_pud)) >> +#endif /* CONFIG_XIP_KERNEL */ >> + >>  static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) >>  { >>      /* Before MMU is enabled */ >> @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) >>    static phys_addr_t __init alloc_pmd_early(uintptr_t va) >>  { >> -   BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); >> +   BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); >>        return (uintptr_t)early_pmd; >>  } >> @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, >>      create_pte_mapping(ptep, va, pa, sz, prot); >>  } >>  -#define pgd_next_t       pmd_t >> -#define alloc_pgd_next(__va)   pt_ops.alloc_pmd(__va) >> -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) >> +static pud_t *__init get_pud_virt_early(phys_addr_t pa) >> +{ >> +   return (pud_t *)((uintptr_t)pa); >> +} >> + >> +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) >> +{ >> +   clear_fixmap(FIX_PUD); >> +   return (pud_t *)set_fixmap_offset(FIX_PUD, pa); >> +} >> + >> +static pud_t *__init get_pud_virt_late(phys_addr_t pa) >> +{ >> +   return (pud_t *)__va(pa); >> +} >> + >> +static phys_addr_t __init alloc_pud_early(uintptr_t va) >> +{ >> +   /* Only one PUD is available for early mapping */ >> +   BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); >> + >> +   return (uintptr_t)early_pud; >> +} >> + >> +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) >> +{ >> +   return 
memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >> +} >> + >> +static phys_addr_t alloc_pud_late(uintptr_t va) >> +{ >> +   unsigned long vaddr; >> + >> +   vaddr = __get_free_page(GFP_KERNEL); >> +   BUG_ON(!vaddr); >> +   return __pa(vaddr); >> +} >> + >> +static void __init create_pud_mapping(pud_t *pudp, >> +                     uintptr_t va, phys_addr_t pa, >> +                     phys_addr_t sz, pgprot_t prot) >> +{ >> +   pmd_t *nextp; >> +   phys_addr_t next_phys; >> +   uintptr_t pud_index = pud_index(va); >> + >> +   if (sz == PUD_SIZE) { >> +       if (pud_val(pudp[pud_index]) == 0) >> +           pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); >> +       return; >> +   } >> + >> +   if (pud_val(pudp[pud_index]) == 0) { >> +       next_phys = pt_ops.alloc_pmd(va); >> +       pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); >> +       nextp = pt_ops.get_pmd_virt(next_phys); >> +       memset(nextp, 0, PAGE_SIZE); >> +   } else { >> +       next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); >> +       nextp = pt_ops.get_pmd_virt(next_phys); >> +   } >> + >> +   create_pmd_mapping(nextp, va, pa, sz, prot); >> +} >> + >> +#define pgd_next_t       pud_t >> +#define alloc_pgd_next(__va)   (pgtable_l4_enabled ?           \ >> +       pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) >> +#define get_pgd_next_virt(__pa)   (pgtable_l4_enabled ?           \ >> +       pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) >>  #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)   \ >> -   create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) >> -#define fixmap_pgd_next       fixmap_pmd >> +               (pgtable_l4_enabled ?           \ >> +       create_pud_mapping(__nextp, __va, __pa, __sz, __prot) :   \ >> +       create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) >> +#define fixmap_pgd_next       (pgtable_l4_enabled ?           
\ >> +       (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) >> +#define trampoline_pgd_next   (pgtable_l4_enabled ?           \ >> +       (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) >> +#define early_dtb_pgd_next   (pgtable_l4_enabled ?           \ >> +       (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) >>  #else >>  #define pgd_next_t       pte_t >>  #define alloc_pgd_next(__va)   pt_ops.alloc_pte(__va) >>  #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) >>  #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot)   \ >>      create_pte_mapping(__nextp, __va, __pa, __sz, __prot) >> -#define fixmap_pgd_next       fixmap_pte >> +#define fixmap_pgd_next       ((uintptr_t)fixmap_pte) >> +#define early_dtb_pgd_next   ((uintptr_t)early_dtb_pmd) >> +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) >>  #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) >> -#endif >> +#endif /* __PAGETABLE_PMD_FOLDED */ >>    void __init create_pgd_mapping(pgd_t *pgdp, >>                        uintptr_t va, phys_addr_t pa, >> @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) >>  } >>  #endif /* CONFIG_STRICT_KERNEL_RWX */ >>  +#ifdef CONFIG_64BIT >> +static void __init disable_pgtable_l4(void) >> +{ >> +   pgtable_l4_enabled = false; >> +   kernel_map.page_offset = PAGE_OFFSET_L3; >> +   satp_mode = SATP_MODE_39; >> +} >> + >> +/* >> + * There is a simple way to determine if 4-level is supported by the >> + * underlying hardware: establish 1:1 mapping in 4-level page table mode >> + * then read SATP to see if the configuration was taken into account >> + * meaning sv48 is supported. 
>> + */ >> +static __init void set_satp_mode(void) >> +{ >> +   u64 identity_satp, hw_satp; >> +   uintptr_t set_satp_mode_pmd; >> + >> +   set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; >> +   create_pgd_mapping(early_pg_dir, >> +              set_satp_mode_pmd, (uintptr_t)early_pud, >> +              PGDIR_SIZE, PAGE_TABLE); >> +   create_pud_mapping(early_pud, >> +              set_satp_mode_pmd, (uintptr_t)early_pmd, >> +              PUD_SIZE, PAGE_TABLE); >> +   /* Handle the case where set_satp_mode straddles 2 PMDs */ >> +   create_pmd_mapping(early_pmd, >> +              set_satp_mode_pmd, set_satp_mode_pmd, >> +              PMD_SIZE, PAGE_KERNEL_EXEC); >> +   create_pmd_mapping(early_pmd, >> +              set_satp_mode_pmd + PMD_SIZE, >> +              set_satp_mode_pmd + PMD_SIZE, >> +              PMD_SIZE, PAGE_KERNEL_EXEC); >> + >> +   identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; >> + >> +   local_flush_tlb_all(); >> +   csr_write(CSR_SATP, identity_satp); >> +   hw_satp = csr_swap(CSR_SATP, 0ULL); >> +   local_flush_tlb_all(); >> + >> +   if (hw_satp != identity_satp) >> +       disable_pgtable_l4(); >> + >> +   memset(early_pg_dir, 0, PAGE_SIZE); >> +   memset(early_pud, 0, PAGE_SIZE); >> +   memset(early_pmd, 0, PAGE_SIZE); >> +} >> +#endif >> + >>  /* >>   * setup_vm() is called from head.S with MMU-off. >>   * >> @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) >>      uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); >>        create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, >> -              IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, >> +              IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, >>                 PGDIR_SIZE, >>                 IS_ENABLED(CONFIG_64BIT) ? 
PAGE_TABLE : PAGE_KERNEL); >>  +   if (pgtable_l4_enabled) { >> +       create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, >> +                  (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); >> +   } >> + >>      if (IS_ENABLED(CONFIG_64BIT)) { >>          create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, >>                     pa, PMD_SIZE, PAGE_KERNEL); >> @@ -593,6 +738,8 @@ void pt_ops_set_early(void) >>  #ifndef __PAGETABLE_PMD_FOLDED >>      pt_ops.alloc_pmd = alloc_pmd_early; >>      pt_ops.get_pmd_virt = get_pmd_virt_early; >> +   pt_ops.alloc_pud = alloc_pud_early; >> +   pt_ops.get_pud_virt = get_pud_virt_early; >>  #endif >>  } >>  @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void) >>  #ifndef __PAGETABLE_PMD_FOLDED >>      pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); >>      pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); >> +   pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); >> +   pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); >>  #endif >>  } >>  @@ -625,6 +774,8 @@ void pt_ops_set_late(void) >>  #ifndef __PAGETABLE_PMD_FOLDED >>      pt_ops.alloc_pmd = alloc_pmd_late; >>      pt_ops.get_pmd_virt = get_pmd_virt_late; >> +   pt_ops.alloc_pud = alloc_pud_late; >> +   pt_ops.get_pud_virt = get_pud_virt_late; >>  #endif >>  } >>  @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>      pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; >>        kernel_map.virt_addr = KERNEL_LINK_ADDR; >> +   kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); >>    #ifdef CONFIG_XIP_KERNEL >>      kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; >> @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>      kernel_map.phys_addr = (uintptr_t)(&_start); >>      kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; >>  #endif >> + >> +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) >> +   
set_satp_mode(); >> +#endif >> + >>      kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; >>      kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; >>  @@ -676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>        /* Setup early PGD for fixmap */ >>      create_pgd_mapping(early_pg_dir, FIXADDR_START, >> -              (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); >> +              fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); >>    #ifndef __PAGETABLE_PMD_FOLDED >> -   /* Setup fixmap PMD */ >> +   /* Setup fixmap PUD and PMD */ >> +   if (pgtable_l4_enabled) >> +       create_pud_mapping(fixmap_pud, FIXADDR_START, >> +                  (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); >>      create_pmd_mapping(fixmap_pmd, FIXADDR_START, >>                 (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); >>      /* Setup trampoline PGD and PMD */ >>      create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, >> -              (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); >> +              trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); >> +   if (pgtable_l4_enabled) >> +       create_pud_mapping(trampoline_pud, kernel_map.virt_addr, >> +                  (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); >>  #ifdef CONFIG_XIP_KERNEL >>      create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, >>                 kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); >> @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>       * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap >>       * range can not span multiple pmds. 
>>       */ >> -   BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) >> +   BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) >>               != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); >>    #ifndef __PAGETABLE_PMD_FOLDED >> @@ -783,9 +946,10 @@ static void __init setup_vm_final(void) >>      /* Clear fixmap PTE and PMD mappings */ >>      clear_fixmap(FIX_PTE); >>      clear_fixmap(FIX_PMD); >> +   clear_fixmap(FIX_PUD); >>        /* Move to swapper page table */ >> -   csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); >> +   csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); >>      local_flush_tlb_all(); >>        pt_ops_set_late(); >> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c >> index 1434a0225140..993f50571a3b 100644 >> --- a/arch/riscv/mm/kasan_init.c >> +++ b/arch/riscv/mm/kasan_init.c >> @@ -11,7 +11,29 @@ >>  #include <asm/fixmap.h> >>  #include <asm/pgalloc.h> >>  +/* >> + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 >> + * which is right before the kernel. >> + * >> + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate >> + * the page global directory with kasan_early_shadow_pmd. >> + * >> + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping >> + * must be divided as follows: >> + * - the first PGD entry, although incomplete, is populated with >> + *  kasan_early_shadow_pud/p4d >> + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d >> + * - the last PGD entry is shared with the kernel mapping so populated at the >> + *  lower levels pud/p4d >> + * >> + * In addition, when shallow populating a kasan region (for example vmalloc), >> + * this region may also not be aligned on PGDIR size, so we must go down to the >> + * pud level too. 
>> + */ >> + >>  extern pgd_t early_pg_dir[PTRS_PER_PGD]; >> +extern struct pt_alloc_ops _pt_ops __initdata; >> +#define pt_ops   _pt_ops >>    static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) >>  { >> @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned >>      set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); >>  } >>  -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) >> +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) >>  { >>      phys_addr_t phys_addr; >>      pmd_t *pmdp, *base_pmd; >>      unsigned long next; >>  -   base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); >> -   if (base_pmd == lm_alias(kasan_early_shadow_pmd)) >> +   if (pud_none(*pud)) { >>          base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); >> +   } else { >> +       base_pmd = (pmd_t *)pud_pgtable(*pud); >> +       if (base_pmd == lm_alias(kasan_early_shadow_pmd)) >> +           base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); >> +   } >>        pmdp = base_pmd + pmd_index(vaddr); >>  @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned >>       * it entirely, memblock could allocate a page at a physical address >>       * where KASAN is not populated yet and then we'd get a page fault. 
>>       */ >> -   set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); >> +   set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); >> +} >> + >> +static void __init kasan_populate_pud(pgd_t *pgd, >> +                     unsigned long vaddr, unsigned long end, >> +                     bool early) >> +{ >> +   phys_addr_t phys_addr; >> +   pud_t *pudp, *base_pud; >> +   unsigned long next; >> + >> +   if (early) { >> +       /* >> +        * We can't use pgd_page_vaddr here as it would return a linear >> +        * mapping address but it is not mapped yet, but when populating >> +        * early_pg_dir, we need the physical address and when populating >> +        * swapper_pg_dir, we need the kernel virtual address so use >> +        * pt_ops facility. >> +        */ >> +       base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); >> +   } else { >> +       base_pud = (pud_t *)pgd_page_vaddr(*pgd); >> +       if (base_pud == lm_alias(kasan_early_shadow_pud)) >> +           base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); >> +   } >> + >> +   pudp = base_pud + pud_index(vaddr); >> + >> +   do { >> +       next = pud_addr_end(vaddr, end); >> + >> +       if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { >> +           if (early) { >> +               phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); >> +               set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); >> +               continue; >> +           } else { >> +               phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); >> +               if (phys_addr) { >> +                   set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); >> +                   continue; >> +               } >> +           } >> +       } >> + >> +       kasan_populate_pmd(pudp, vaddr, next); >> +   } while (pudp++, vaddr = next, vaddr != end); >> + >> +   /* >> +    * Wait for the whole PGD to be populated before setting the PGD in 
>> +    * the page table, otherwise, if we did set the PGD before populating >> +    * it entirely, memblock could allocate a page at a physical address >> +    * where KASAN is not populated yet and then we'd get a page fault. >> +    */ >> +   if (!early) >> +       set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); >>  } >>  +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ?   \ >> +               (uintptr_t)kasan_early_shadow_pud : \ >> +               (uintptr_t)kasan_early_shadow_pmd) >> +#define kasan_populate_pgd_next(pgdp, vaddr, next, early)           \ >> +       (pgtable_l4_enabled ?                       \ >> +           kasan_populate_pud(pgdp, vaddr, next, early) :       \ >> +           kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) >> + >>  static void __init kasan_populate_pgd(pgd_t *pgdp, >>                        unsigned long vaddr, unsigned long end, >>                        bool early) >> @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, >>              } >>          } >>  -       kasan_populate_pmd(pgdp, vaddr, next); >> +       kasan_populate_pgd_next(pgdp, vaddr, next, early); >>      } while (pgdp++, vaddr = next, vaddr != end); >>  } >>  @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end) >>      memset(start, KASAN_SHADOW_INIT, end - start); >>  } >>  +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, >> +                         unsigned long vaddr, unsigned long end, >> +                         bool kasan_populate) >> +{ >> +   unsigned long next; >> +   pud_t *pudp, *base_pud; >> +   pmd_t *base_pmd; >> +   bool is_kasan_pmd; >> + >> +   base_pud = (pud_t *)pgd_page_vaddr(*pgdp); >> +   pudp = base_pud + pud_index(vaddr); >> + >> +   if (kasan_populate) >> +       memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, >> +              sizeof(pud_t) * PTRS_PER_PUD); >> + >> +   do { >> +       next = pud_addr_end(vaddr, end); >> +       is_kasan_pmd = 
(pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); >> + >> +       if (is_kasan_pmd) { >> +           base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); >> +           set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); >> +       } >> +   } while (pudp++, vaddr = next, vaddr != end); >> +} >> + >>  static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) >>  { >>      unsigned long next; >>      void *p; >>      pgd_t *pgd_k = pgd_offset_k(vaddr); >> +   bool is_kasan_pgd_next; >>        do { >>          next = pgd_addr_end(vaddr, end); >> -       if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { >> +       is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == >> +                    (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); >> + >> +       if (is_kasan_pgd_next) { >>              p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); >>              set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); >>          } >> + >> +       if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) >> +           continue; >> + >> +       kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); >>      } while (pgd_k++, vaddr = next, vaddr != end); >>  } > > > @Qinglin: I can deal with sv57 kasan population if need be, as it is a bit tricky and I think it would save you quite some time :) Thanks so much for your suggestion! I want to give it a try myself first, as I am now preparing a new Sv57 patchset :) I will ask for your help if I run into any trouble, and thanks again!
Yours, Qinglin > > >>  diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c >> index 26e69788f27a..b3db5d91ed38 100644 >> --- a/drivers/firmware/efi/libstub/efi-stub.c >> +++ b/drivers/firmware/efi/libstub/efi-stub.c >> @@ -40,6 +40,8 @@ >>    #ifdef CONFIG_ARM64 >>  # define EFI_RT_VIRTUAL_LIMIT   DEFAULT_MAP_WINDOW_64 >> +#elif defined(CONFIG_RISCV) >> +# define EFI_RT_VIRTUAL_LIMIT   TASK_SIZE_MIN >>  #else >>  # define EFI_RT_VIRTUAL_LIMIT   TASK_SIZE >>  #endif
On Mon, 6 Dec 2021 11:46:51 +0100 Alexandre Ghiti <alexandre.ghiti@canonical.com> wrote: > By adding a new 4th level of page table, give the possibility to 64bit > kernel to address 2^48 bytes of virtual address: in practice, that offers > 128TB of virtual address space to userspace and allows up to 64TB of > physical memory. > > If the underlying hardware does not support sv48, we will automatically > fallback to a standard 3-level page table by folding the new PUD level into > PGDIR level. In order to detect HW capabilities at runtime, we > use SATP feature that ignores writes with an unsupported mode. > > Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com> > --- > arch/riscv/Kconfig | 4 +- > arch/riscv/include/asm/csr.h | 3 +- > arch/riscv/include/asm/fixmap.h | 1 + > arch/riscv/include/asm/kasan.h | 6 +- > arch/riscv/include/asm/page.h | 14 ++ > arch/riscv/include/asm/pgalloc.h | 40 +++++ > arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++- > arch/riscv/include/asm/pgtable.h | 24 ++- > arch/riscv/kernel/head.S | 3 +- > arch/riscv/mm/context.c | 4 +- > arch/riscv/mm/init.c | 212 +++++++++++++++++++++--- > arch/riscv/mm/kasan_init.c | 137 ++++++++++++++- > drivers/firmware/efi/libstub/efi-stub.c | 2 + > 13 files changed, 514 insertions(+), 44 deletions(-) > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index ac6c0cd9bc29..d28fe0148e13 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -150,7 +150,7 @@ config PAGE_OFFSET > hex > default 0xC0000000 if 32BIT > default 0x80000000 if 64BIT && !MMU > - default 0xffffffd800000000 if 64BIT > + default 0xffffaf8000000000 if 64BIT > > config KASAN_SHADOW_OFFSET > hex > @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM > > config PGTABLE_LEVELS > int > - default 3 if 64BIT > + default 4 if 64BIT > default 2 > > config LOCKDEP_SUPPORT > diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h > index 87ac65696871..3fdb971c7896 100644 > --- a/arch/riscv/include/asm/csr.h > 
+++ b/arch/riscv/include/asm/csr.h > @@ -40,14 +40,13 @@ > #ifndef CONFIG_64BIT > #define SATP_PPN _AC(0x003FFFFF, UL) > #define SATP_MODE_32 _AC(0x80000000, UL) > -#define SATP_MODE SATP_MODE_32 > #define SATP_ASID_BITS 9 > #define SATP_ASID_SHIFT 22 > #define SATP_ASID_MASK _AC(0x1FF, UL) > #else > #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) > #define SATP_MODE_39 _AC(0x8000000000000000, UL) > -#define SATP_MODE SATP_MODE_39 > +#define SATP_MODE_48 _AC(0x9000000000000000, UL) > #define SATP_ASID_BITS 16 > #define SATP_ASID_SHIFT 44 > #define SATP_ASID_MASK _AC(0xFFFF, UL) > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h > index 54cbf07fb4e9..58a718573ad6 100644 > --- a/arch/riscv/include/asm/fixmap.h > +++ b/arch/riscv/include/asm/fixmap.h > @@ -24,6 +24,7 @@ enum fixed_addresses { > FIX_HOLE, > FIX_PTE, > FIX_PMD, > + FIX_PUD, > FIX_TEXT_POKE1, > FIX_TEXT_POKE0, > FIX_EARLYCON_MEM_BASE, > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h > index 743e6ff57996..0b85e363e778 100644 > --- a/arch/riscv/include/asm/kasan.h > +++ b/arch/riscv/include/asm/kasan.h > @@ -28,7 +28,11 @@ > #define KASAN_SHADOW_SCALE_SHIFT 3 > > #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) > -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) > +/* > + * Depending on the size of the virtual address space, the region may not be > + * aligned on PGDIR_SIZE, so force its alignment to ease its population. 
> + */ > +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) > #define KASAN_SHADOW_END MODULES_LOWEST_VADDR > #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index e03559f9b35e..d089fe46f7d8 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -31,7 +31,20 @@ > * When not using MMU this corresponds to the first free page in > * physical memory (aligned on a page boundary). > */ > +#ifdef CONFIG_64BIT > +#ifdef CONFIG_MMU > +#define PAGE_OFFSET kernel_map.page_offset > +#else > +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > +#endif > +/* > + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so > + * define the PAGE_OFFSET value for SV39. > + */ > +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) > +#else > #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > +#endif /* CONFIG_64BIT */ > > /* > * Half of the kernel address space (half of the entries of the page global > @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base; > #endif /* CONFIG_MMU */ > > struct kernel_mapping { > + unsigned long page_offset; > unsigned long virt_addr; > uintptr_t phys_addr; > uintptr_t size; > diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h > index 0af6933a7100..11823004b87a 100644 > --- a/arch/riscv/include/asm/pgalloc.h > +++ b/arch/riscv/include/asm/pgalloc.h > @@ -11,6 +11,8 @@ > #include <asm/tlb.h> > > #ifdef CONFIG_MMU > +#define __HAVE_ARCH_PUD_ALLOC_ONE > +#define __HAVE_ARCH_PUD_FREE > #include <asm-generic/pgalloc.h> > > static inline void pmd_populate_kernel(struct mm_struct *mm, > @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) > > set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > } > + > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) > +{ > + if 
(pgtable_l4_enabled) { > + unsigned long pfn = virt_to_pfn(pud); > + > + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > + } > +} > + > +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, > + pud_t *pud) > +{ > + if (pgtable_l4_enabled) { > + unsigned long pfn = virt_to_pfn(pud); > + > + set_p4d_safe(p4d, > + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > + } > +} > + > +#define pud_alloc_one pud_alloc_one > +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) > +{ > + if (pgtable_l4_enabled) > + return __pud_alloc_one(mm, addr); > + > + return NULL; > +} > + > +#define pud_free pud_free > +static inline void pud_free(struct mm_struct *mm, pud_t *pud) > +{ > + if (pgtable_l4_enabled) > + __pud_free(mm, pud); > +} > + > +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) > #endif /* __PAGETABLE_PMD_FOLDED */ > > static inline pgd_t *pgd_alloc(struct mm_struct *mm) > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h > index 228261aa9628..bbbdd66e5e2f 100644 > --- a/arch/riscv/include/asm/pgtable-64.h > +++ b/arch/riscv/include/asm/pgtable-64.h > @@ -8,16 +8,36 @@ > > #include <linux/const.h> > > -#define PGDIR_SHIFT 30 > +extern bool pgtable_l4_enabled; > + > +#define PGDIR_SHIFT_L3 30 > +#define PGDIR_SHIFT_L4 39 > +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) > + > +#define PGDIR_SHIFT (pgtable_l4_enabled ? 
PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) > /* Size of region mapped by a page global directory */ > #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) > #define PGDIR_MASK (~(PGDIR_SIZE - 1)) > > +/* pud is folded into pgd in case of 3-level page table */ > +#define PUD_SHIFT 30 > +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) > +#define PUD_MASK (~(PUD_SIZE - 1)) > + > #define PMD_SHIFT 21 > /* Size of region mapped by a page middle directory */ > #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) > #define PMD_MASK (~(PMD_SIZE - 1)) > > +/* Page Upper Directory entry */ > +typedef struct { > + unsigned long pud; > +} pud_t; > + > +#define pud_val(x) ((x).pud) > +#define __pud(x) ((pud_t) { (x) }) > +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) > + > /* Page Middle Directory entry */ > typedef struct { > unsigned long pmd; > @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) > set_pud(pudp, __pud(0)); > } > > +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) > +{ > + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > +} > + > +static inline unsigned long _pud_pfn(pud_t pud) > +{ > + return pud_val(pud) >> _PAGE_PFN_SHIFT; > +} > + > static inline pmd_t *pud_pgtable(pud_t pud) > { > return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); > @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) > return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); > } > > +#define mm_pud_folded mm_pud_folded > +static inline bool mm_pud_folded(struct mm_struct *mm) > +{ > + if (pgtable_l4_enabled) > + return false; > + > + return true; > +} > + > +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) > + > static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) > { > return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) > #define pmd_ERROR(e) \ > pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) > > +#define pud_ERROR(e) \ > + 
pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) > + > +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + *p4dp = p4d; > + else > + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); > +} > + > +static inline int p4d_none(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (p4d_val(p4d) == 0); > + > + return 0; > +} > + > +static inline int p4d_present(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (p4d_val(p4d) & _PAGE_PRESENT); > + > + return 1; > +} > + > +static inline int p4d_bad(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return !p4d_present(p4d); > + > + return 0; > +} > + > +static inline void p4d_clear(p4d_t *p4d) > +{ > + if (pgtable_l4_enabled) > + set_p4d(p4d, __p4d(0)); > +} > + > +static inline pud_t *p4d_pgtable(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > + > + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); > +} > + > +static inline struct page *p4d_page(p4d_t p4d) > +{ > + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > +} > + > +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) > + > +#define pud_offset pud_offset > +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) > +{ > + if (pgtable_l4_enabled) > + return p4d_pgtable(*p4d) + pud_index(address); > + > + return (pud_t *)p4d; > +} > + > #endif /* _ASM_RISCV_PGTABLE_64_H */ > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > index e1a52e22ad7e..e1c74ef4ead2 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -51,7 +51,7 @@ > * position vmemmap directly below the VMALLOC region. > */ > #ifdef CONFIG_64BIT > -#define VA_BITS 39 > +#define VA_BITS (pgtable_l4_enabled ? 
48 : 39) > #else > #define VA_BITS 32 > #endif > @@ -90,8 +90,7 @@ > > #ifndef __ASSEMBLY__ > > -/* Page Upper Directory not used in RISC-V */ > -#include <asm-generic/pgtable-nopud.h> > +#include <asm-generic/pgtable-nop4d.h> > #include <asm/page.h> > #include <asm/tlbflush.h> > #include <linux/mm_types.h> > @@ -113,6 +112,17 @@ > #define XIP_FIXUP(addr) (addr) > #endif /* CONFIG_XIP_KERNEL */ > > +struct pt_alloc_ops { > + pte_t *(*get_pte_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pte)(uintptr_t va); > +#ifndef __PAGETABLE_PMD_FOLDED > + pmd_t *(*get_pmd_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pmd)(uintptr_t va); > + pud_t *(*get_pud_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pud)(uintptr_t va); > +#endif > +}; > + > #ifdef CONFIG_MMU > /* Number of entries in the page global directory */ > #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) > @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, > * Note that PGDIR_SIZE must evenly divide TASK_SIZE. 
> */ > #ifdef CONFIG_64BIT > -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) > #else > -#define TASK_SIZE FIXADDR_START > +#define TASK_SIZE FIXADDR_START > +#define TASK_SIZE_MIN TASK_SIZE > #endif > > #else /* CONFIG_MMU */ > @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa; > #define dtb_early_va _dtb_early_va > #define dtb_early_pa _dtb_early_pa > #endif /* CONFIG_XIP_KERNEL */ > +extern u64 satp_mode; > +extern bool pgtable_l4_enabled; > > void paging_init(void); > void misc_mem_init(void); > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > index 52c5ff9804c5..c3c0ed559770 100644 > --- a/arch/riscv/kernel/head.S > +++ b/arch/riscv/kernel/head.S > @@ -95,7 +95,8 @@ relocate: > > /* Compute satp for kernel page tables, but don't load it yet */ > srl a2, a0, PAGE_SHIFT > - li a1, SATP_MODE > + la a1, satp_mode > + REG_L a1, 0(a1) > or a2, a2, a1 > > /* > diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c > index ee3459cb6750..a7246872bd30 100644 > --- a/arch/riscv/mm/context.c > +++ b/arch/riscv/mm/context.c > @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > switch_mm_fast: > csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | > ((cntx & asid_mask) << SATP_ASID_SHIFT) | > - SATP_MODE); > + satp_mode); > > if (need_flush_tlb) > local_flush_tlb_all(); > @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > static void set_mm_noasid(struct mm_struct *mm) > { > /* Switch the page table and blindly nuke entire local TLB */ > - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); > + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); > local_flush_tlb_all(); > } > > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index 1552226fb6bd..6a19a1b1caf8 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -37,6 +37,17 @@ 
EXPORT_SYMBOL(kernel_map); > #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) > #endif > > +#ifdef CONFIG_64BIT > +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; > +#else > +u64 satp_mode = SATP_MODE_32; > +#endif > +EXPORT_SYMBOL(satp_mode); > + > +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? > + true : false; Hi Alex, I'm not sure whether we can use a static key for pgtable_l4_enabled or not. Obviously, for a specific HW platform, pgtable_l4_enabled won't change after boot, and it seems to sit in a hot code path, so IMHO a static key may be suitable for it. Thanks
On Tue, Dec 7, 2021 at 11:54 AM Alexandre Ghiti <alexandre.ghiti@canonical.com> wrote: > > By adding a new 4th level of page table, give the possibility to 64bit > kernel to address 2^48 bytes of virtual address: in practice, that offers > 128TB of virtual address space to userspace and allows up to 64TB of > physical memory. > > If the underlying hardware does not support sv48, we will automatically > fallback to a standard 3-level page table by folding the new PUD level into > PGDIR level. In order to detect HW capabilities at runtime, we > use SATP feature that ignores writes with an unsupported mode. > > Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com> > --- > arch/riscv/Kconfig | 4 +- > arch/riscv/include/asm/csr.h | 3 +- > arch/riscv/include/asm/fixmap.h | 1 + > arch/riscv/include/asm/kasan.h | 6 +- > arch/riscv/include/asm/page.h | 14 ++ > arch/riscv/include/asm/pgalloc.h | 40 +++++ > arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++- > arch/riscv/include/asm/pgtable.h | 24 ++- > arch/riscv/kernel/head.S | 3 +- > arch/riscv/mm/context.c | 4 +- > arch/riscv/mm/init.c | 212 +++++++++++++++++++++--- > arch/riscv/mm/kasan_init.c | 137 ++++++++++++++- > drivers/firmware/efi/libstub/efi-stub.c | 2 + > 13 files changed, 514 insertions(+), 44 deletions(-) > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index ac6c0cd9bc29..d28fe0148e13 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -150,7 +150,7 @@ config PAGE_OFFSET > hex > default 0xC0000000 if 32BIT > default 0x80000000 if 64BIT && !MMU > - default 0xffffffd800000000 if 64BIT > + default 0xffffaf8000000000 if 64BIT > > config KASAN_SHADOW_OFFSET > hex > @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM > > config PGTABLE_LEVELS > int > - default 3 if 64BIT > + default 4 if 64BIT > default 2 > > config LOCKDEP_SUPPORT > diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h > index 87ac65696871..3fdb971c7896 100644 > --- a/arch/riscv/include/asm/csr.h > 
+++ b/arch/riscv/include/asm/csr.h > @@ -40,14 +40,13 @@ > #ifndef CONFIG_64BIT > #define SATP_PPN _AC(0x003FFFFF, UL) > #define SATP_MODE_32 _AC(0x80000000, UL) > -#define SATP_MODE SATP_MODE_32 > #define SATP_ASID_BITS 9 > #define SATP_ASID_SHIFT 22 > #define SATP_ASID_MASK _AC(0x1FF, UL) > #else > #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) > #define SATP_MODE_39 _AC(0x8000000000000000, UL) > -#define SATP_MODE SATP_MODE_39 > +#define SATP_MODE_48 _AC(0x9000000000000000, UL) > #define SATP_ASID_BITS 16 > #define SATP_ASID_SHIFT 44 > #define SATP_ASID_MASK _AC(0xFFFF, UL) > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h > index 54cbf07fb4e9..58a718573ad6 100644 > --- a/arch/riscv/include/asm/fixmap.h > +++ b/arch/riscv/include/asm/fixmap.h > @@ -24,6 +24,7 @@ enum fixed_addresses { > FIX_HOLE, > FIX_PTE, > FIX_PMD, > + FIX_PUD, > FIX_TEXT_POKE1, > FIX_TEXT_POKE0, > FIX_EARLYCON_MEM_BASE, > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h > index 743e6ff57996..0b85e363e778 100644 > --- a/arch/riscv/include/asm/kasan.h > +++ b/arch/riscv/include/asm/kasan.h > @@ -28,7 +28,11 @@ > #define KASAN_SHADOW_SCALE_SHIFT 3 > > #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) > -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) > +/* > + * Depending on the size of the virtual address space, the region may not be > + * aligned on PGDIR_SIZE, so force its alignment to ease its population. 
> + */ > +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) > #define KASAN_SHADOW_END MODULES_LOWEST_VADDR > #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index e03559f9b35e..d089fe46f7d8 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -31,7 +31,20 @@ > * When not using MMU this corresponds to the first free page in > * physical memory (aligned on a page boundary). > */ > +#ifdef CONFIG_64BIT > +#ifdef CONFIG_MMU > +#define PAGE_OFFSET kernel_map.page_offset > +#else > +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > +#endif > +/* > + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so > + * define the PAGE_OFFSET value for SV39. > + */ > +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) > +#else > #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > +#endif /* CONFIG_64BIT */ > > /* > * Half of the kernel address space (half of the entries of the page global > @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base; > #endif /* CONFIG_MMU */ > > struct kernel_mapping { > + unsigned long page_offset; > unsigned long virt_addr; > uintptr_t phys_addr; > uintptr_t size; > diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h > index 0af6933a7100..11823004b87a 100644 > --- a/arch/riscv/include/asm/pgalloc.h > +++ b/arch/riscv/include/asm/pgalloc.h > @@ -11,6 +11,8 @@ > #include <asm/tlb.h> > > #ifdef CONFIG_MMU > +#define __HAVE_ARCH_PUD_ALLOC_ONE > +#define __HAVE_ARCH_PUD_FREE > #include <asm-generic/pgalloc.h> > > static inline void pmd_populate_kernel(struct mm_struct *mm, > @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) > > set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > } > + > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) > +{ > + if 
(pgtable_l4_enabled) { > + unsigned long pfn = virt_to_pfn(pud); > + > + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > + } > +} > + > +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, > + pud_t *pud) > +{ > + if (pgtable_l4_enabled) { > + unsigned long pfn = virt_to_pfn(pud); > + > + set_p4d_safe(p4d, > + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > + } > +} > + > +#define pud_alloc_one pud_alloc_one > +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) > +{ > + if (pgtable_l4_enabled) > + return __pud_alloc_one(mm, addr); > + > + return NULL; > +} > + > +#define pud_free pud_free > +static inline void pud_free(struct mm_struct *mm, pud_t *pud) > +{ > + if (pgtable_l4_enabled) > + __pud_free(mm, pud); > +} > + > +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) > #endif /* __PAGETABLE_PMD_FOLDED */ > > static inline pgd_t *pgd_alloc(struct mm_struct *mm) > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h > index 228261aa9628..bbbdd66e5e2f 100644 > --- a/arch/riscv/include/asm/pgtable-64.h > +++ b/arch/riscv/include/asm/pgtable-64.h > @@ -8,16 +8,36 @@ > > #include <linux/const.h> > > -#define PGDIR_SHIFT 30 > +extern bool pgtable_l4_enabled; > + > +#define PGDIR_SHIFT_L3 30 > +#define PGDIR_SHIFT_L4 39 > +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) > + > +#define PGDIR_SHIFT (pgtable_l4_enabled ? 
PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) > /* Size of region mapped by a page global directory */ > #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) > #define PGDIR_MASK (~(PGDIR_SIZE - 1)) > > +/* pud is folded into pgd in case of 3-level page table */ > +#define PUD_SHIFT 30 > +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) > +#define PUD_MASK (~(PUD_SIZE - 1)) > + > #define PMD_SHIFT 21 > /* Size of region mapped by a page middle directory */ > #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) > #define PMD_MASK (~(PMD_SIZE - 1)) > > +/* Page Upper Directory entry */ > +typedef struct { > + unsigned long pud; > +} pud_t; > + > +#define pud_val(x) ((x).pud) > +#define __pud(x) ((pud_t) { (x) }) > +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) > + > /* Page Middle Directory entry */ > typedef struct { > unsigned long pmd; > @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) > set_pud(pudp, __pud(0)); > } > > +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) > +{ > + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > +} > + > +static inline unsigned long _pud_pfn(pud_t pud) > +{ > + return pud_val(pud) >> _PAGE_PFN_SHIFT; > +} > + > static inline pmd_t *pud_pgtable(pud_t pud) > { > return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); > @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) > return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); > } > > +#define mm_pud_folded mm_pud_folded > +static inline bool mm_pud_folded(struct mm_struct *mm) > +{ > + if (pgtable_l4_enabled) > + return false; > + > + return true; > +} > + > +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) > + > static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) > { > return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) > #define pmd_ERROR(e) \ > pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) > > +#define pud_ERROR(e) \ > + 
pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) > + > +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + *p4dp = p4d; > + else > + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); > +} > + > +static inline int p4d_none(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (p4d_val(p4d) == 0); > + > + return 0; > +} > + > +static inline int p4d_present(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (p4d_val(p4d) & _PAGE_PRESENT); > + > + return 1; > +} > + > +static inline int p4d_bad(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return !p4d_present(p4d); > + > + return 0; > +} > + > +static inline void p4d_clear(p4d_t *p4d) > +{ > + if (pgtable_l4_enabled) > + set_p4d(p4d, __p4d(0)); > +} > + > +static inline pud_t *p4d_pgtable(p4d_t p4d) > +{ > + if (pgtable_l4_enabled) > + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > + > + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); > +} > + > +static inline struct page *p4d_page(p4d_t p4d) > +{ > + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > +} > + > +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) > + > +#define pud_offset pud_offset > +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) > +{ > + if (pgtable_l4_enabled) > + return p4d_pgtable(*p4d) + pud_index(address); > + > + return (pud_t *)p4d; > +} > + > #endif /* _ASM_RISCV_PGTABLE_64_H */ > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > index e1a52e22ad7e..e1c74ef4ead2 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -51,7 +51,7 @@ > * position vmemmap directly below the VMALLOC region. > */ > #ifdef CONFIG_64BIT > -#define VA_BITS 39 > +#define VA_BITS (pgtable_l4_enabled ? 
48 : 39) > #else > #define VA_BITS 32 > #endif > @@ -90,8 +90,7 @@ > > #ifndef __ASSEMBLY__ > > -/* Page Upper Directory not used in RISC-V */ > -#include <asm-generic/pgtable-nopud.h> > +#include <asm-generic/pgtable-nop4d.h> > #include <asm/page.h> > #include <asm/tlbflush.h> > #include <linux/mm_types.h> > @@ -113,6 +112,17 @@ > #define XIP_FIXUP(addr) (addr) > #endif /* CONFIG_XIP_KERNEL */ > > +struct pt_alloc_ops { > + pte_t *(*get_pte_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pte)(uintptr_t va); > +#ifndef __PAGETABLE_PMD_FOLDED > + pmd_t *(*get_pmd_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pmd)(uintptr_t va); > + pud_t *(*get_pud_virt)(phys_addr_t pa); > + phys_addr_t (*alloc_pud)(uintptr_t va); > +#endif > +}; > + > #ifdef CONFIG_MMU > /* Number of entries in the page global directory */ > #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) > @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, > * Note that PGDIR_SIZE must evenly divide TASK_SIZE. > */ > #ifdef CONFIG_64BIT > -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) > #else > -#define TASK_SIZE FIXADDR_START > +#define TASK_SIZE FIXADDR_START > +#define TASK_SIZE_MIN TASK_SIZE This is used by efi-stub.c; the rv64 compat patch also needs it, and we reuse the DEFAULT_MAP_WINDOW_64 macro. TASK_SIZE_MIN is also okay for me, but I think it should be a separate patch together with the efi-stub modification. https://lore.kernel.org/linux-riscv/20211228143958.3409187-9-guoren@kernel.org/ I've merged your patchset with the compat tree and we are testing them together thoroughly and carefully. https://github.com/c-sky/csky-linux/tree/riscv_compat_v2_sv48_v3 So far, booting rv32_rootfs & 64_rootfs has passed. I will give you a Tested-by later, once everything has been fully tested. Your patch set is very helpful, thx. ps: Could you give customers the chance to choose sv48 or sv39 in the dts?
> #endif > > #else /* CONFIG_MMU */ > @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa; > #define dtb_early_va _dtb_early_va > #define dtb_early_pa _dtb_early_pa > #endif /* CONFIG_XIP_KERNEL */ > +extern u64 satp_mode; > +extern bool pgtable_l4_enabled; > > void paging_init(void); > void misc_mem_init(void); > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > index 52c5ff9804c5..c3c0ed559770 100644 > --- a/arch/riscv/kernel/head.S > +++ b/arch/riscv/kernel/head.S > @@ -95,7 +95,8 @@ relocate: > > /* Compute satp for kernel page tables, but don't load it yet */ > srl a2, a0, PAGE_SHIFT > - li a1, SATP_MODE > + la a1, satp_mode > + REG_L a1, 0(a1) > or a2, a2, a1 > > /* > diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c > index ee3459cb6750..a7246872bd30 100644 > --- a/arch/riscv/mm/context.c > +++ b/arch/riscv/mm/context.c > @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > switch_mm_fast: > csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | > ((cntx & asid_mask) << SATP_ASID_SHIFT) | > - SATP_MODE); > + satp_mode); > > if (need_flush_tlb) > local_flush_tlb_all(); > @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > static void set_mm_noasid(struct mm_struct *mm) > { > /* Switch the page table and blindly nuke entire local TLB */ > - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); > + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); > local_flush_tlb_all(); > } > > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index 1552226fb6bd..6a19a1b1caf8 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map); > #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) > #endif > > +#ifdef CONFIG_64BIT > +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? 
SATP_MODE_48 : SATP_MODE_39; > +#else > +u64 satp_mode = SATP_MODE_32; > +#endif > +EXPORT_SYMBOL(satp_mode); > + > +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? > + true : false; > +EXPORT_SYMBOL(pgtable_l4_enabled); > + > phys_addr_t phys_ram_base __ro_after_init; > EXPORT_SYMBOL(phys_ram_base); > > @@ -53,15 +64,6 @@ extern char _start[]; > void *_dtb_early_va __initdata; > uintptr_t _dtb_early_pa __initdata; > > -struct pt_alloc_ops { > - pte_t *(*get_pte_virt)(phys_addr_t pa); > - phys_addr_t (*alloc_pte)(uintptr_t va); > -#ifndef __PAGETABLE_PMD_FOLDED > - pmd_t *(*get_pmd_virt)(phys_addr_t pa); > - phys_addr_t (*alloc_pmd)(uintptr_t va); > -#endif > -}; > - > static phys_addr_t dma32_phys_limit __initdata; > > static void __init zone_sizes_init(void) > @@ -222,7 +224,7 @@ static void __init setup_bootmem(void) > } > > #ifdef CONFIG_MMU > -static struct pt_alloc_ops _pt_ops __initdata; > +struct pt_alloc_ops _pt_ops __initdata; > > #ifdef CONFIG_XIP_KERNEL > #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) > @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; > static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; > > pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); > +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); > static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); > > #ifdef CONFIG_XIP_KERNEL > @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); > #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) > #endif /* CONFIG_XIP_KERNEL */ > > +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; > +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; > +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); > + > +#ifdef CONFIG_XIP_KERNEL > +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) > +#define fixmap_pud 
((pud_t *)XIP_FIXUP(fixmap_pud)) > +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) > +#endif /* CONFIG_XIP_KERNEL */ > + > static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) > { > /* Before MMU is enabled */ > @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) > > static phys_addr_t __init alloc_pmd_early(uintptr_t va) > { > - BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); > + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); > > return (uintptr_t)early_pmd; > } > @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, > create_pte_mapping(ptep, va, pa, sz, prot); > } > > -#define pgd_next_t pmd_t > -#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) > -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) > +static pud_t *__init get_pud_virt_early(phys_addr_t pa) > +{ > + return (pud_t *)((uintptr_t)pa); > +} > + > +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) > +{ > + clear_fixmap(FIX_PUD); > + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); > +} > + > +static pud_t *__init get_pud_virt_late(phys_addr_t pa) > +{ > + return (pud_t *)__va(pa); > +} > + > +static phys_addr_t __init alloc_pud_early(uintptr_t va) > +{ > + /* Only one PUD is available for early mapping */ > + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); > + > + return (uintptr_t)early_pud; > +} > + > +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) > +{ > + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); > +} > + > +static phys_addr_t alloc_pud_late(uintptr_t va) > +{ > + unsigned long vaddr; > + > + vaddr = __get_free_page(GFP_KERNEL); > + BUG_ON(!vaddr); > + return __pa(vaddr); > +} > + > +static void __init create_pud_mapping(pud_t *pudp, > + uintptr_t va, phys_addr_t pa, > + phys_addr_t sz, pgprot_t prot) > +{ > + pmd_t *nextp; > + phys_addr_t next_phys; > + uintptr_t pud_index = pud_index(va); > + > + if (sz == PUD_SIZE) { > + if (pud_val(pudp[pud_index]) == 0) > + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), 
prot); > + return; > + } > + > + if (pud_val(pudp[pud_index]) == 0) { > + next_phys = pt_ops.alloc_pmd(va); > + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); > + nextp = pt_ops.get_pmd_virt(next_phys); > + memset(nextp, 0, PAGE_SIZE); > + } else { > + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); > + nextp = pt_ops.get_pmd_virt(next_phys); > + } > + > + create_pmd_mapping(nextp, va, pa, sz, prot); > +} > + > +#define pgd_next_t pud_t > +#define alloc_pgd_next(__va) (pgtable_l4_enabled ? \ > + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) > +#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \ > + pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ > - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) > -#define fixmap_pgd_next fixmap_pmd > + (pgtable_l4_enabled ? \ > + create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \ > + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) > +#define fixmap_pgd_next (pgtable_l4_enabled ? \ > + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) > +#define trampoline_pgd_next (pgtable_l4_enabled ? \ > + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) > +#define early_dtb_pgd_next (pgtable_l4_enabled ? 
\ > + (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) > #else > #define pgd_next_t pte_t > #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) > #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ > create_pte_mapping(__nextp, __va, __pa, __sz, __prot) > -#define fixmap_pgd_next fixmap_pte > +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) > +#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) > +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) > #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) > -#endif > +#endif /* __PAGETABLE_PMD_FOLDED */ > > void __init create_pgd_mapping(pgd_t *pgdp, > uintptr_t va, phys_addr_t pa, > @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) > } > #endif /* CONFIG_STRICT_KERNEL_RWX */ > > +#ifdef CONFIG_64BIT > +static void __init disable_pgtable_l4(void) > +{ > + pgtable_l4_enabled = false; > + kernel_map.page_offset = PAGE_OFFSET_L3; > + satp_mode = SATP_MODE_39; > +} > + > +/* > + * There is a simple way to determine if 4-level is supported by the > + * underlying hardware: establish 1:1 mapping in 4-level page table mode > + * then read SATP to see if the configuration was taken into account > + * meaning sv48 is supported. 
> + */ > +static __init void set_satp_mode(void) > +{ > + u64 identity_satp, hw_satp; > + uintptr_t set_satp_mode_pmd; > + > + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; > + create_pgd_mapping(early_pg_dir, > + set_satp_mode_pmd, (uintptr_t)early_pud, > + PGDIR_SIZE, PAGE_TABLE); > + create_pud_mapping(early_pud, > + set_satp_mode_pmd, (uintptr_t)early_pmd, > + PUD_SIZE, PAGE_TABLE); > + /* Handle the case where set_satp_mode straddles 2 PMDs */ > + create_pmd_mapping(early_pmd, > + set_satp_mode_pmd, set_satp_mode_pmd, > + PMD_SIZE, PAGE_KERNEL_EXEC); > + create_pmd_mapping(early_pmd, > + set_satp_mode_pmd + PMD_SIZE, > + set_satp_mode_pmd + PMD_SIZE, > + PMD_SIZE, PAGE_KERNEL_EXEC); > + > + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; > + > + local_flush_tlb_all(); > + csr_write(CSR_SATP, identity_satp); > + hw_satp = csr_swap(CSR_SATP, 0ULL); > + local_flush_tlb_all(); > + > + if (hw_satp != identity_satp) > + disable_pgtable_l4(); > + > + memset(early_pg_dir, 0, PAGE_SIZE); > + memset(early_pud, 0, PAGE_SIZE); > + memset(early_pmd, 0, PAGE_SIZE); > +} > +#endif > + > /* > * setup_vm() is called from head.S with MMU-off. > * > @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) > uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); > > create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, > - IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, > + IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, > PGDIR_SIZE, > IS_ENABLED(CONFIG_64BIT) ? 
PAGE_TABLE : PAGE_KERNEL); > > + if (pgtable_l4_enabled) { > + create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, > + (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); > + } > + > if (IS_ENABLED(CONFIG_64BIT)) { > create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, > pa, PMD_SIZE, PAGE_KERNEL); > @@ -593,6 +738,8 @@ void pt_ops_set_early(void) > #ifndef __PAGETABLE_PMD_FOLDED > pt_ops.alloc_pmd = alloc_pmd_early; > pt_ops.get_pmd_virt = get_pmd_virt_early; > + pt_ops.alloc_pud = alloc_pud_early; > + pt_ops.get_pud_virt = get_pud_virt_early; > #endif > } > > @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void) > #ifndef __PAGETABLE_PMD_FOLDED > pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); > pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); > + pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); > + pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); > #endif > } > > @@ -625,6 +774,8 @@ void pt_ops_set_late(void) > #ifndef __PAGETABLE_PMD_FOLDED > pt_ops.alloc_pmd = alloc_pmd_late; > pt_ops.get_pmd_virt = get_pmd_virt_late; > + pt_ops.alloc_pud = alloc_pud_late; > + pt_ops.get_pud_virt = get_pud_virt_late; > #endif > } > > @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; > > kernel_map.virt_addr = KERNEL_LINK_ADDR; > + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); > > #ifdef CONFIG_XIP_KERNEL > kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; > @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > kernel_map.phys_addr = (uintptr_t)(&_start); > kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; > #endif > + > +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) > + set_satp_mode(); > +#endif > + > kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; > kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; > > @@ 
-676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > /* Setup early PGD for fixmap */ > create_pgd_mapping(early_pg_dir, FIXADDR_START, > - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); > + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); > > #ifndef __PAGETABLE_PMD_FOLDED > - /* Setup fixmap PMD */ > + /* Setup fixmap PUD and PMD */ > + if (pgtable_l4_enabled) > + create_pud_mapping(fixmap_pud, FIXADDR_START, > + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); > create_pmd_mapping(fixmap_pmd, FIXADDR_START, > (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); > /* Setup trampoline PGD and PMD */ > create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, > - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); > + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); > + if (pgtable_l4_enabled) > + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, > + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); > #ifdef CONFIG_XIP_KERNEL > create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, > kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); > @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap > * range can not span multiple pmds. 
> */ > - BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) > + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) > != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); > > #ifndef __PAGETABLE_PMD_FOLDED > @@ -783,9 +946,10 @@ static void __init setup_vm_final(void) > /* Clear fixmap PTE and PMD mappings */ > clear_fixmap(FIX_PTE); > clear_fixmap(FIX_PMD); > + clear_fixmap(FIX_PUD); > > /* Move to swapper page table */ > - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); > + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); > local_flush_tlb_all(); > > pt_ops_set_late(); > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c > index 1434a0225140..993f50571a3b 100644 > --- a/arch/riscv/mm/kasan_init.c > +++ b/arch/riscv/mm/kasan_init.c > @@ -11,7 +11,29 @@ > #include <asm/fixmap.h> > #include <asm/pgalloc.h> > > +/* > + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 > + * which is right before the kernel. > + * > + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate > + * the page global directory with kasan_early_shadow_pmd. > + * > + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping > + * must be divided as follows: > + * - the first PGD entry, although incomplete, is populated with > + * kasan_early_shadow_pud/p4d > + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d > + * - the last PGD entry is shared with the kernel mapping so populated at the > + * lower levels pud/p4d > + * > + * In addition, when shallow populating a kasan region (for example vmalloc), > + * this region may also not be aligned on PGDIR size, so we must go down to the > + * pud level too. 
> + */ > + > extern pgd_t early_pg_dir[PTRS_PER_PGD]; > +extern struct pt_alloc_ops _pt_ops __initdata; > +#define pt_ops _pt_ops > > static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) > { > @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned > set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); > } > > -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) > +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) > { > phys_addr_t phys_addr; > pmd_t *pmdp, *base_pmd; > unsigned long next; > > - base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); > - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) > + if (pud_none(*pud)) { > base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); > + } else { > + base_pmd = (pmd_t *)pud_pgtable(*pud); > + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) > + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); > + } > > pmdp = base_pmd + pmd_index(vaddr); > > @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned > * it entirely, memblock could allocate a page at a physical address > * where KASAN is not populated yet and then we'd get a page fault. > */ > - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > + set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > +} > + > +static void __init kasan_populate_pud(pgd_t *pgd, > + unsigned long vaddr, unsigned long end, > + bool early) > +{ > + phys_addr_t phys_addr; > + pud_t *pudp, *base_pud; > + unsigned long next; > + > + if (early) { > + /* > + * We can't use pgd_page_vaddr here as it would return a linear > + * mapping address but it is not mapped yet, but when populating > + * early_pg_dir, we need the physical address and when populating > + * swapper_pg_dir, we need the kernel virtual address so use > + * pt_ops facility. 
> + */ > + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); > + } else { > + base_pud = (pud_t *)pgd_page_vaddr(*pgd); > + if (base_pud == lm_alias(kasan_early_shadow_pud)) > + base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); > + } > + > + pudp = base_pud + pud_index(vaddr); > + > + do { > + next = pud_addr_end(vaddr, end); > + > + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { > + if (early) { > + phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); > + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); > + continue; > + } else { > + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); > + if (phys_addr) { > + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); > + continue; > + } > + } > + } > + > + kasan_populate_pmd(pudp, vaddr, next); > + } while (pudp++, vaddr = next, vaddr != end); > + > + /* > + * Wait for the whole PGD to be populated before setting the PGD in > + * the page table, otherwise, if we did set the PGD before populating > + * it entirely, memblock could allocate a page at a physical address > + * where KASAN is not populated yet and then we'd get a page fault. > + */ > + if (!early) > + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); > } > > +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \ > + (uintptr_t)kasan_early_shadow_pud : \ > + (uintptr_t)kasan_early_shadow_pmd) > +#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ > + (pgtable_l4_enabled ? 
\ > + kasan_populate_pud(pgdp, vaddr, next, early) : \ > + kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) > + > static void __init kasan_populate_pgd(pgd_t *pgdp, > unsigned long vaddr, unsigned long end, > bool early) > @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, > } > } > > - kasan_populate_pmd(pgdp, vaddr, next); > + kasan_populate_pgd_next(pgdp, vaddr, next, early); > } while (pgdp++, vaddr = next, vaddr != end); > } > > @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end) > memset(start, KASAN_SHADOW_INIT, end - start); > } > > +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, > + unsigned long vaddr, unsigned long end, > + bool kasan_populate) > +{ > + unsigned long next; > + pud_t *pudp, *base_pud; > + pmd_t *base_pmd; > + bool is_kasan_pmd; > + > + base_pud = (pud_t *)pgd_page_vaddr(*pgdp); > + pudp = base_pud + pud_index(vaddr); > + > + if (kasan_populate) > + memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, > + sizeof(pud_t) * PTRS_PER_PUD); > + > + do { > + next = pud_addr_end(vaddr, end); > + is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); > + > + if (is_kasan_pmd) { > + base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); > + set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > + } > + } while (pudp++, vaddr = next, vaddr != end); > +} > + > static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) > { > unsigned long next; > void *p; > pgd_t *pgd_k = pgd_offset_k(vaddr); > + bool is_kasan_pgd_next; > > do { > next = pgd_addr_end(vaddr, end); > - if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { > + is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == > + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); > + > + if (is_kasan_pgd_next) { > p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); > set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); > } > + > + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - 
vaddr) >= PGDIR_SIZE) > + continue; > + > + kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); > } while (pgd_k++, vaddr = next, vaddr != end); > } > > diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c > index 26e69788f27a..b3db5d91ed38 100644 > --- a/drivers/firmware/efi/libstub/efi-stub.c > +++ b/drivers/firmware/efi/libstub/efi-stub.c > @@ -40,6 +40,8 @@ > > #ifdef CONFIG_ARM64 > # define EFI_RT_VIRTUAL_LIMIT DEFAULT_MAP_WINDOW_64 > +#elif defined(CONFIG_RISCV) > +# define EFI_RT_VIRTUAL_LIMIT TASK_SIZE_MIN > #else > # define EFI_RT_VIRTUAL_LIMIT TASK_SIZE > #endif > -- > 2.32.0 > -- Best Regards Guo Ren ML: https://lore.kernel.org/linux-csky/
Hi Guo, On Wed, Dec 29, 2021 at 4:42 AM Guo Ren <guoren@kernel.org> wrote: > > On Tue, Dec 7, 2021 at 11:54 AM Alexandre Ghiti > <alexandre.ghiti@canonical.com> wrote: > > > > By adding a new 4th level of page table, give the possibility to 64bit > > kernel to address 2^48 bytes of virtual address: in practice, that offers > > 128TB of virtual address space to userspace and allows up to 64TB of > > physical memory. > > > > If the underlying hardware does not support sv48, we will automatically > > fallback to a standard 3-level page table by folding the new PUD level into > > PGDIR level. In order to detect HW capabilities at runtime, we > > use SATP feature that ignores writes with an unsupported mode. > > > > Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com> > > --- > > arch/riscv/Kconfig | 4 +- > > arch/riscv/include/asm/csr.h | 3 +- > > arch/riscv/include/asm/fixmap.h | 1 + > > arch/riscv/include/asm/kasan.h | 6 +- > > arch/riscv/include/asm/page.h | 14 ++ > > arch/riscv/include/asm/pgalloc.h | 40 +++++ > > arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++- > > arch/riscv/include/asm/pgtable.h | 24 ++- > > arch/riscv/kernel/head.S | 3 +- > > arch/riscv/mm/context.c | 4 +- > > arch/riscv/mm/init.c | 212 +++++++++++++++++++++--- > > arch/riscv/mm/kasan_init.c | 137 ++++++++++++++- > > drivers/firmware/efi/libstub/efi-stub.c | 2 + > > 13 files changed, 514 insertions(+), 44 deletions(-) > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > > index ac6c0cd9bc29..d28fe0148e13 100644 > > --- a/arch/riscv/Kconfig > > +++ b/arch/riscv/Kconfig > > @@ -150,7 +150,7 @@ config PAGE_OFFSET > > hex > > default 0xC0000000 if 32BIT > > default 0x80000000 if 64BIT && !MMU > > - default 0xffffffd800000000 if 64BIT > > + default 0xffffaf8000000000 if 64BIT > > > > config KASAN_SHADOW_OFFSET > > hex > > @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM > > > > config PGTABLE_LEVELS > > int > > - default 3 if 64BIT > > + default 4 if 64BIT > > default 2 > > > > 
config LOCKDEP_SUPPORT > > diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h > > index 87ac65696871..3fdb971c7896 100644 > > --- a/arch/riscv/include/asm/csr.h > > +++ b/arch/riscv/include/asm/csr.h > > @@ -40,14 +40,13 @@ > > #ifndef CONFIG_64BIT > > #define SATP_PPN _AC(0x003FFFFF, UL) > > #define SATP_MODE_32 _AC(0x80000000, UL) > > -#define SATP_MODE SATP_MODE_32 > > #define SATP_ASID_BITS 9 > > #define SATP_ASID_SHIFT 22 > > #define SATP_ASID_MASK _AC(0x1FF, UL) > > #else > > #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) > > #define SATP_MODE_39 _AC(0x8000000000000000, UL) > > -#define SATP_MODE SATP_MODE_39 > > +#define SATP_MODE_48 _AC(0x9000000000000000, UL) > > #define SATP_ASID_BITS 16 > > #define SATP_ASID_SHIFT 44 > > #define SATP_ASID_MASK _AC(0xFFFF, UL) > > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h > > index 54cbf07fb4e9..58a718573ad6 100644 > > --- a/arch/riscv/include/asm/fixmap.h > > +++ b/arch/riscv/include/asm/fixmap.h > > @@ -24,6 +24,7 @@ enum fixed_addresses { > > FIX_HOLE, > > FIX_PTE, > > FIX_PMD, > > + FIX_PUD, > > FIX_TEXT_POKE1, > > FIX_TEXT_POKE0, > > FIX_EARLYCON_MEM_BASE, > > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h > > index 743e6ff57996..0b85e363e778 100644 > > --- a/arch/riscv/include/asm/kasan.h > > +++ b/arch/riscv/include/asm/kasan.h > > @@ -28,7 +28,11 @@ > > #define KASAN_SHADOW_SCALE_SHIFT 3 > > > > #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) > > -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) > > +/* > > + * Depending on the size of the virtual address space, the region may not be > > + * aligned on PGDIR_SIZE, so force its alignment to ease its population. 
> > + */ > > +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) > > #define KASAN_SHADOW_END MODULES_LOWEST_VADDR > > #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) > > > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > > index e03559f9b35e..d089fe46f7d8 100644 > > --- a/arch/riscv/include/asm/page.h > > +++ b/arch/riscv/include/asm/page.h > > @@ -31,7 +31,20 @@ > > * When not using MMU this corresponds to the first free page in > > * physical memory (aligned on a page boundary). > > */ > > +#ifdef CONFIG_64BIT > > +#ifdef CONFIG_MMU > > +#define PAGE_OFFSET kernel_map.page_offset > > +#else > > +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > > +#endif > > +/* > > + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so > > + * define the PAGE_OFFSET value for SV39. > > + */ > > +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) > > +#else > > #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > > +#endif /* CONFIG_64BIT */ > > > > /* > > * Half of the kernel address space (half of the entries of the page global > > @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base; > > #endif /* CONFIG_MMU */ > > > > struct kernel_mapping { > > + unsigned long page_offset; > > unsigned long virt_addr; > > uintptr_t phys_addr; > > uintptr_t size; > > diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h > > index 0af6933a7100..11823004b87a 100644 > > --- a/arch/riscv/include/asm/pgalloc.h > > +++ b/arch/riscv/include/asm/pgalloc.h > > @@ -11,6 +11,8 @@ > > #include <asm/tlb.h> > > > > #ifdef CONFIG_MMU > > +#define __HAVE_ARCH_PUD_ALLOC_ONE > > +#define __HAVE_ARCH_PUD_FREE > > #include <asm-generic/pgalloc.h> > > > > static inline void pmd_populate_kernel(struct mm_struct *mm, > > @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) > > > > set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > > } > 
> + > > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) > > +{ > > + if (pgtable_l4_enabled) { > > + unsigned long pfn = virt_to_pfn(pud); > > + > > + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > > + } > > +} > > + > > +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, > > + pud_t *pud) > > +{ > > + if (pgtable_l4_enabled) { > > + unsigned long pfn = virt_to_pfn(pud); > > + > > + set_p4d_safe(p4d, > > + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > > + } > > +} > > + > > +#define pud_alloc_one pud_alloc_one > > +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) > > +{ > > + if (pgtable_l4_enabled) > > + return __pud_alloc_one(mm, addr); > > + > > + return NULL; > > +} > > + > > +#define pud_free pud_free > > +static inline void pud_free(struct mm_struct *mm, pud_t *pud) > > +{ > > + if (pgtable_l4_enabled) > > + __pud_free(mm, pud); > > +} > > + > > +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) > > #endif /* __PAGETABLE_PMD_FOLDED */ > > > > static inline pgd_t *pgd_alloc(struct mm_struct *mm) > > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h > > index 228261aa9628..bbbdd66e5e2f 100644 > > --- a/arch/riscv/include/asm/pgtable-64.h > > +++ b/arch/riscv/include/asm/pgtable-64.h > > @@ -8,16 +8,36 @@ > > > > #include <linux/const.h> > > > > -#define PGDIR_SHIFT 30 > > +extern bool pgtable_l4_enabled; > > + > > +#define PGDIR_SHIFT_L3 30 > > +#define PGDIR_SHIFT_L4 39 > > +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) > > + > > +#define PGDIR_SHIFT (pgtable_l4_enabled ? 
PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) > > /* Size of region mapped by a page global directory */ > > #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) > > #define PGDIR_MASK (~(PGDIR_SIZE - 1)) > > > > +/* pud is folded into pgd in case of 3-level page table */ > > +#define PUD_SHIFT 30 > > +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) > > +#define PUD_MASK (~(PUD_SIZE - 1)) > > + > > #define PMD_SHIFT 21 > > /* Size of region mapped by a page middle directory */ > > #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) > > #define PMD_MASK (~(PMD_SIZE - 1)) > > > > +/* Page Upper Directory entry */ > > +typedef struct { > > + unsigned long pud; > > +} pud_t; > > + > > +#define pud_val(x) ((x).pud) > > +#define __pud(x) ((pud_t) { (x) }) > > +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) > > + > > /* Page Middle Directory entry */ > > typedef struct { > > unsigned long pmd; > > @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) > > set_pud(pudp, __pud(0)); > > } > > > > +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) > > +{ > > + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > > +} > > + > > +static inline unsigned long _pud_pfn(pud_t pud) > > +{ > > + return pud_val(pud) >> _PAGE_PFN_SHIFT; > > +} > > + > > static inline pmd_t *pud_pgtable(pud_t pud) > > { > > return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); > > @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) > > return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); > > } > > > > +#define mm_pud_folded mm_pud_folded > > +static inline bool mm_pud_folded(struct mm_struct *mm) > > +{ > > + if (pgtable_l4_enabled) > > + return false; > > + > > + return true; > > +} > > + > > +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) > > + > > static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) > > { > > return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > > @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) > > #define 
pmd_ERROR(e) \ > > pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) > > > > +#define pud_ERROR(e) \ > > + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) > > + > > +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + *p4dp = p4d; > > + else > > + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); > > +} > > + > > +static inline int p4d_none(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return (p4d_val(p4d) == 0); > > + > > + return 0; > > +} > > + > > +static inline int p4d_present(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return (p4d_val(p4d) & _PAGE_PRESENT); > > + > > + return 1; > > +} > > + > > +static inline int p4d_bad(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return !p4d_present(p4d); > > + > > + return 0; > > +} > > + > > +static inline void p4d_clear(p4d_t *p4d) > > +{ > > + if (pgtable_l4_enabled) > > + set_p4d(p4d, __p4d(0)); > > +} > > + > > +static inline pud_t *p4d_pgtable(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > > + > > + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); > > +} > > + > > +static inline struct page *p4d_page(p4d_t p4d) > > +{ > > + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > > +} > > + > > +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) > > + > > +#define pud_offset pud_offset > > +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) > > +{ > > + if (pgtable_l4_enabled) > > + return p4d_pgtable(*p4d) + pud_index(address); > > + > > + return (pud_t *)p4d; > > +} > > + > > #endif /* _ASM_RISCV_PGTABLE_64_H */ > > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > > index e1a52e22ad7e..e1c74ef4ead2 100644 > > --- a/arch/riscv/include/asm/pgtable.h > > +++ b/arch/riscv/include/asm/pgtable.h > > @@ -51,7 +51,7 @@ > > * position vmemmap directly below the VMALLOC region. 
> > */ > > #ifdef CONFIG_64BIT > > -#define VA_BITS 39 > > +#define VA_BITS (pgtable_l4_enabled ? 48 : 39) > > #else > > #define VA_BITS 32 > > #endif > > @@ -90,8 +90,7 @@ > > > > #ifndef __ASSEMBLY__ > > > > -/* Page Upper Directory not used in RISC-V */ > > -#include <asm-generic/pgtable-nopud.h> > > +#include <asm-generic/pgtable-nop4d.h> > > #include <asm/page.h> > > #include <asm/tlbflush.h> > > #include <linux/mm_types.h> > > @@ -113,6 +112,17 @@ > > #define XIP_FIXUP(addr) (addr) > > #endif /* CONFIG_XIP_KERNEL */ > > > > +struct pt_alloc_ops { > > + pte_t *(*get_pte_virt)(phys_addr_t pa); > > + phys_addr_t (*alloc_pte)(uintptr_t va); > > +#ifndef __PAGETABLE_PMD_FOLDED > > + pmd_t *(*get_pmd_virt)(phys_addr_t pa); > > + phys_addr_t (*alloc_pmd)(uintptr_t va); > > + pud_t *(*get_pud_virt)(phys_addr_t pa); > > + phys_addr_t (*alloc_pud)(uintptr_t va); > > +#endif > > +}; > > + > > #ifdef CONFIG_MMU > > /* Number of entries in the page global directory */ > > #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) > > @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, > > * Note that PGDIR_SIZE must evenly divide TASK_SIZE. > > */ > > #ifdef CONFIG_64BIT > > -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > > +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > > +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) > > #else > > -#define TASK_SIZE FIXADDR_START > > +#define TASK_SIZE FIXADDR_START > > +#define TASK_SIZE_MIN TASK_SIZE > This is used by efi-stub.c, rv64 compat patch also need it, we reuse > DEFAULT_MAP_WINDOW_64 macro. > > TASK_SIZE_MIN is also okay for me, I think it should be a separate > patch with efi-stub midification. IMO, TASK_SIZE_MIN is more explicit than DEFAULT_MAP_WINDOW_64. I'll split this change in the next series. > https://lore.kernel.org/linux-riscv/20211228143958.3409187-9-guoren@kernel.org/ > > I've merged your patchset with compat tree and we are testing them > together totally & carefully. 
> https://github.com/c-sky/csky-linux/tree/riscv_compat_v2_sv48_v3 > > Now, rv32_rootfs & 64_rootfs booting have passed. But I would > give you tested-by later after it is fully tested. Your patch set is very > helpful, thx. Thanks a lot, that will help move forward ;) > > ps: Could you give customers the chance to choose sv48 or sv39 in dts? > This is already implemented in patch 13. Thanks! Alex > > > #endif > > > > #else /* CONFIG_MMU */ > > @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa; > > #define dtb_early_va _dtb_early_va > > #define dtb_early_pa _dtb_early_pa > > #endif /* CONFIG_XIP_KERNEL */ > > +extern u64 satp_mode; > > +extern bool pgtable_l4_enabled; > > > > void paging_init(void); > > void misc_mem_init(void); > > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > > index 52c5ff9804c5..c3c0ed559770 100644 > > --- a/arch/riscv/kernel/head.S > > +++ b/arch/riscv/kernel/head.S > > @@ -95,7 +95,8 @@ relocate: > > > > /* Compute satp for kernel page tables, but don't load it yet */ > > srl a2, a0, PAGE_SHIFT > > - li a1, SATP_MODE > > + la a1, satp_mode > > + REG_L a1, 0(a1) > > or a2, a2, a1 > > > > /* > > diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c > > index ee3459cb6750..a7246872bd30 100644 > > --- a/arch/riscv/mm/context.c > > +++ b/arch/riscv/mm/context.c > > @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > > switch_mm_fast: > > csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | > > ((cntx & asid_mask) << SATP_ASID_SHIFT) | > > - SATP_MODE); > > + satp_mode); > > > > if (need_flush_tlb) > > local_flush_tlb_all(); > > @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > > static void set_mm_noasid(struct mm_struct *mm) > > { > > /* Switch the page table and blindly nuke entire local TLB */ > > - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); > > + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); > > local_flush_tlb_all(); > > } > > > > diff 
--git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > > index 1552226fb6bd..6a19a1b1caf8 100644 > > --- a/arch/riscv/mm/init.c > > +++ b/arch/riscv/mm/init.c > > @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map); > > #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) > > #endif > > > > +#ifdef CONFIG_64BIT > > +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; > > +#else > > +u64 satp_mode = SATP_MODE_32; > > +#endif > > +EXPORT_SYMBOL(satp_mode); > > + > > +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? > > + true : false; > > +EXPORT_SYMBOL(pgtable_l4_enabled); > > + > > phys_addr_t phys_ram_base __ro_after_init; > > EXPORT_SYMBOL(phys_ram_base); > > > > @@ -53,15 +64,6 @@ extern char _start[]; > > void *_dtb_early_va __initdata; > > uintptr_t _dtb_early_pa __initdata; > > > > -struct pt_alloc_ops { > > - pte_t *(*get_pte_virt)(phys_addr_t pa); > > - phys_addr_t (*alloc_pte)(uintptr_t va); > > -#ifndef __PAGETABLE_PMD_FOLDED > > - pmd_t *(*get_pmd_virt)(phys_addr_t pa); > > - phys_addr_t (*alloc_pmd)(uintptr_t va); > > -#endif > > -}; > > - > > static phys_addr_t dma32_phys_limit __initdata; > > > > static void __init zone_sizes_init(void) > > @@ -222,7 +224,7 @@ static void __init setup_bootmem(void) > > } > > > > #ifdef CONFIG_MMU > > -static struct pt_alloc_ops _pt_ops __initdata; > > +struct pt_alloc_ops _pt_ops __initdata; > > > > #ifdef CONFIG_XIP_KERNEL > > #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) > > @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; > > static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; > > > > pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); > > +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); > > static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); > > > > #ifdef CONFIG_XIP_KERNEL > > @@ -326,6 +329,16 @@ 
static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); > > #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) > > #endif /* CONFIG_XIP_KERNEL */ > > > > +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; > > +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; > > +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); > > + > > +#ifdef CONFIG_XIP_KERNEL > > +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) > > +#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud)) > > +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) > > +#endif /* CONFIG_XIP_KERNEL */ > > + > > static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) > > { > > /* Before MMU is enabled */ > > @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) > > > > static phys_addr_t __init alloc_pmd_early(uintptr_t va) > > { > > - BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); > > + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); > > > > return (uintptr_t)early_pmd; > > } > > @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, > > create_pte_mapping(ptep, va, pa, sz, prot); > > } > > > > -#define pgd_next_t pmd_t > > -#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) > > -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) > > +static pud_t *__init get_pud_virt_early(phys_addr_t pa) > > +{ > > + return (pud_t *)((uintptr_t)pa); > > +} > > + > > +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) > > +{ > > + clear_fixmap(FIX_PUD); > > + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); > > +} > > + > > +static pud_t *__init get_pud_virt_late(phys_addr_t pa) > > +{ > > + return (pud_t *)__va(pa); > > +} > > + > > +static phys_addr_t __init alloc_pud_early(uintptr_t va) > > +{ > > + /* Only one PUD is available for early mapping */ > > + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); > > + > > + return (uintptr_t)early_pud; > > +} > > + > > +static phys_addr_t __init 
alloc_pud_fixmap(uintptr_t va) > > +{ > > + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); > > +} > > + > > +static phys_addr_t alloc_pud_late(uintptr_t va) > > +{ > > + unsigned long vaddr; > > + > > + vaddr = __get_free_page(GFP_KERNEL); > > + BUG_ON(!vaddr); > > + return __pa(vaddr); > > +} > > + > > +static void __init create_pud_mapping(pud_t *pudp, > > + uintptr_t va, phys_addr_t pa, > > + phys_addr_t sz, pgprot_t prot) > > +{ > > + pmd_t *nextp; > > + phys_addr_t next_phys; > > + uintptr_t pud_index = pud_index(va); > > + > > + if (sz == PUD_SIZE) { > > + if (pud_val(pudp[pud_index]) == 0) > > + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); > > + return; > > + } > > + > > + if (pud_val(pudp[pud_index]) == 0) { > > + next_phys = pt_ops.alloc_pmd(va); > > + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); > > + nextp = pt_ops.get_pmd_virt(next_phys); > > + memset(nextp, 0, PAGE_SIZE); > > + } else { > > + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); > > + nextp = pt_ops.get_pmd_virt(next_phys); > > + } > > + > > + create_pmd_mapping(nextp, va, pa, sz, prot); > > +} > > + > > +#define pgd_next_t pud_t > > +#define alloc_pgd_next(__va) (pgtable_l4_enabled ? \ > > + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) > > +#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \ > > + pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) > > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ > > - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) > > -#define fixmap_pgd_next fixmap_pmd > > + (pgtable_l4_enabled ? \ > > + create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \ > > + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) > > +#define fixmap_pgd_next (pgtable_l4_enabled ? \ > > + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) > > +#define trampoline_pgd_next (pgtable_l4_enabled ? 
\ > > + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) > > +#define early_dtb_pgd_next (pgtable_l4_enabled ? \ > > + (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) > > #else > > #define pgd_next_t pte_t > > #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) > > #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) > > #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ > > create_pte_mapping(__nextp, __va, __pa, __sz, __prot) > > -#define fixmap_pgd_next fixmap_pte > > +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) > > +#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) > > +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) > > #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) > > -#endif > > +#endif /* __PAGETABLE_PMD_FOLDED */ > > > > void __init create_pgd_mapping(pgd_t *pgdp, > > uintptr_t va, phys_addr_t pa, > > @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) > > } > > #endif /* CONFIG_STRICT_KERNEL_RWX */ > > > > +#ifdef CONFIG_64BIT > > +static void __init disable_pgtable_l4(void) > > +{ > > + pgtable_l4_enabled = false; > > + kernel_map.page_offset = PAGE_OFFSET_L3; > > + satp_mode = SATP_MODE_39; > > +} > > + > > +/* > > + * There is a simple way to determine if 4-level is supported by the > > + * underlying hardware: establish 1:1 mapping in 4-level page table mode > > + * then read SATP to see if the configuration was taken into account > > + * meaning sv48 is supported. 
> > + */ > > +static __init void set_satp_mode(void) > > +{ > > + u64 identity_satp, hw_satp; > > + uintptr_t set_satp_mode_pmd; > > + > > + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; > > + create_pgd_mapping(early_pg_dir, > > + set_satp_mode_pmd, (uintptr_t)early_pud, > > + PGDIR_SIZE, PAGE_TABLE); > > + create_pud_mapping(early_pud, > > + set_satp_mode_pmd, (uintptr_t)early_pmd, > > + PUD_SIZE, PAGE_TABLE); > > + /* Handle the case where set_satp_mode straddles 2 PMDs */ > > + create_pmd_mapping(early_pmd, > > + set_satp_mode_pmd, set_satp_mode_pmd, > > + PMD_SIZE, PAGE_KERNEL_EXEC); > > + create_pmd_mapping(early_pmd, > > + set_satp_mode_pmd + PMD_SIZE, > > + set_satp_mode_pmd + PMD_SIZE, > > + PMD_SIZE, PAGE_KERNEL_EXEC); > > + > > + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; > > + > > + local_flush_tlb_all(); > > + csr_write(CSR_SATP, identity_satp); > > + hw_satp = csr_swap(CSR_SATP, 0ULL); > > + local_flush_tlb_all(); > > + > > + if (hw_satp != identity_satp) > > + disable_pgtable_l4(); > > + > > + memset(early_pg_dir, 0, PAGE_SIZE); > > + memset(early_pud, 0, PAGE_SIZE); > > + memset(early_pmd, 0, PAGE_SIZE); > > +} > > +#endif > > + > > /* > > * setup_vm() is called from head.S with MMU-off. > > * > > @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) > > uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); > > > > create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, > > - IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, > > + IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, > > PGDIR_SIZE, > > IS_ENABLED(CONFIG_64BIT) ? 
PAGE_TABLE : PAGE_KERNEL); > > > > + if (pgtable_l4_enabled) { > > + create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, > > + (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); > > + } > > + > > if (IS_ENABLED(CONFIG_64BIT)) { > > create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, > > pa, PMD_SIZE, PAGE_KERNEL); > > @@ -593,6 +738,8 @@ void pt_ops_set_early(void) > > #ifndef __PAGETABLE_PMD_FOLDED > > pt_ops.alloc_pmd = alloc_pmd_early; > > pt_ops.get_pmd_virt = get_pmd_virt_early; > > + pt_ops.alloc_pud = alloc_pud_early; > > + pt_ops.get_pud_virt = get_pud_virt_early; > > #endif > > } > > > > @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void) > > #ifndef __PAGETABLE_PMD_FOLDED > > pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); > > pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); > > + pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); > > + pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); > > #endif > > } > > > > @@ -625,6 +774,8 @@ void pt_ops_set_late(void) > > #ifndef __PAGETABLE_PMD_FOLDED > > pt_ops.alloc_pmd = alloc_pmd_late; > > pt_ops.get_pmd_virt = get_pmd_virt_late; > > + pt_ops.alloc_pud = alloc_pud_late; > > + pt_ops.get_pud_virt = get_pud_virt_late; > > #endif > > } > > > > @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; > > > > kernel_map.virt_addr = KERNEL_LINK_ADDR; > > + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); > > > > #ifdef CONFIG_XIP_KERNEL > > kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; > > @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > kernel_map.phys_addr = (uintptr_t)(&_start); > > kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; > > #endif > > + > > +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) > > + set_satp_mode(); > > +#endif > > + > > kernel_map.va_pa_offset = PAGE_OFFSET - 
kernel_map.phys_addr; > > kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; > > > > @@ -676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > > > /* Setup early PGD for fixmap */ > > create_pgd_mapping(early_pg_dir, FIXADDR_START, > > - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); > > + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); > > > > #ifndef __PAGETABLE_PMD_FOLDED > > - /* Setup fixmap PMD */ > > + /* Setup fixmap PUD and PMD */ > > + if (pgtable_l4_enabled) > > + create_pud_mapping(fixmap_pud, FIXADDR_START, > > + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); > > create_pmd_mapping(fixmap_pmd, FIXADDR_START, > > (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); > > /* Setup trampoline PGD and PMD */ > > create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, > > - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); > > + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); > > + if (pgtable_l4_enabled) > > + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, > > + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); > > #ifdef CONFIG_XIP_KERNEL > > create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, > > kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); > > @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap > > * range can not span multiple pmds. 
> > */ > > - BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) > > + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) > > != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); > > > > #ifndef __PAGETABLE_PMD_FOLDED > > @@ -783,9 +946,10 @@ static void __init setup_vm_final(void) > > /* Clear fixmap PTE and PMD mappings */ > > clear_fixmap(FIX_PTE); > > clear_fixmap(FIX_PMD); > > + clear_fixmap(FIX_PUD); > > > > /* Move to swapper page table */ > > - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); > > + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); > > local_flush_tlb_all(); > > > > pt_ops_set_late(); > > diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c > > index 1434a0225140..993f50571a3b 100644 > > --- a/arch/riscv/mm/kasan_init.c > > +++ b/arch/riscv/mm/kasan_init.c > > @@ -11,7 +11,29 @@ > > #include <asm/fixmap.h> > > #include <asm/pgalloc.h> > > > > +/* > > + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 > > + * which is right before the kernel. > > + * > > + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate > > + * the page global directory with kasan_early_shadow_pmd. > > + * > > + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping > > + * must be divided as follows: > > + * - the first PGD entry, although incomplete, is populated with > > + * kasan_early_shadow_pud/p4d > > + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d > > + * - the last PGD entry is shared with the kernel mapping so populated at the > > + * lower levels pud/p4d > > + * > > + * In addition, when shallow populating a kasan region (for example vmalloc), > > + * this region may also not be aligned on PGDIR size, so we must go down to the > > + * pud level too. 
> > + */ > > + > > extern pgd_t early_pg_dir[PTRS_PER_PGD]; > > +extern struct pt_alloc_ops _pt_ops __initdata; > > +#define pt_ops _pt_ops > > > > static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) > > { > > @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned > > set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); > > } > > > > -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) > > +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) > > { > > phys_addr_t phys_addr; > > pmd_t *pmdp, *base_pmd; > > unsigned long next; > > > > - base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); > > - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) > > + if (pud_none(*pud)) { > > base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); > > + } else { > > + base_pmd = (pmd_t *)pud_pgtable(*pud); > > + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) > > + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); > > + } > > > > pmdp = base_pmd + pmd_index(vaddr); > > > > @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned > > * it entirely, memblock could allocate a page at a physical address > > * where KASAN is not populated yet and then we'd get a page fault. 
> > */ > > - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > > + set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > > +} > > + > > +static void __init kasan_populate_pud(pgd_t *pgd, > > + unsigned long vaddr, unsigned long end, > > + bool early) > > +{ > > + phys_addr_t phys_addr; > > + pud_t *pudp, *base_pud; > > + unsigned long next; > > + > > + if (early) { > > + /* > > + * We can't use pgd_page_vaddr here as it would return a linear > > + * mapping address but it is not mapped yet, but when populating > > + * early_pg_dir, we need the physical address and when populating > > + * swapper_pg_dir, we need the kernel virtual address so use > > + * pt_ops facility. > > + */ > > + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); > > + } else { > > + base_pud = (pud_t *)pgd_page_vaddr(*pgd); > > + if (base_pud == lm_alias(kasan_early_shadow_pud)) > > + base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); > > + } > > + > > + pudp = base_pud + pud_index(vaddr); > > + > > + do { > > + next = pud_addr_end(vaddr, end); > > + > > + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { > > + if (early) { > > + phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); > > + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); > > + continue; > > + } else { > > + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); > > + if (phys_addr) { > > + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); > > + continue; > > + } > > + } > > + } > > + > > + kasan_populate_pmd(pudp, vaddr, next); > > + } while (pudp++, vaddr = next, vaddr != end); > > + > > + /* > > + * Wait for the whole PGD to be populated before setting the PGD in > > + * the page table, otherwise, if we did set the PGD before populating > > + * it entirely, memblock could allocate a page at a physical address > > + * where KASAN is not populated yet and then we'd get a page fault. 
> > + */ > > + if (!early) > > + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); > > } > > > > +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \ > > + (uintptr_t)kasan_early_shadow_pud : \ > > + (uintptr_t)kasan_early_shadow_pmd) > > +#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ > > + (pgtable_l4_enabled ? \ > > + kasan_populate_pud(pgdp, vaddr, next, early) : \ > > + kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) > > + > > static void __init kasan_populate_pgd(pgd_t *pgdp, > > unsigned long vaddr, unsigned long end, > > bool early) > > @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, > > } > > } > > > > - kasan_populate_pmd(pgdp, vaddr, next); > > + kasan_populate_pgd_next(pgdp, vaddr, next, early); > > } while (pgdp++, vaddr = next, vaddr != end); > > } > > > > @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end) > > memset(start, KASAN_SHADOW_INIT, end - start); > > } > > > > +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, > > + unsigned long vaddr, unsigned long end, > > + bool kasan_populate) > > +{ > > + unsigned long next; > > + pud_t *pudp, *base_pud; > > + pmd_t *base_pmd; > > + bool is_kasan_pmd; > > + > > + base_pud = (pud_t *)pgd_page_vaddr(*pgdp); > > + pudp = base_pud + pud_index(vaddr); > > + > > + if (kasan_populate) > > + memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, > > + sizeof(pud_t) * PTRS_PER_PUD); > > + > > + do { > > + next = pud_addr_end(vaddr, end); > > + is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); > > + > > + if (is_kasan_pmd) { > > + base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); > > + set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); > > + } > > + } while (pudp++, vaddr = next, vaddr != end); > > +} > > + > > static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) > > { > > unsigned long next; > > void *p; > > pgd_t *pgd_k = pgd_offset_k(vaddr); > > + bool 
is_kasan_pgd_next; > > > > do { > > next = pgd_addr_end(vaddr, end); > > - if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { > > + is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == > > + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); > > + > > + if (is_kasan_pgd_next) { > > p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); > > set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); > > } > > + > > + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) > > + continue; > > + > > + kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); > > } while (pgd_k++, vaddr = next, vaddr != end); > > } > > > > diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c > > index 26e69788f27a..b3db5d91ed38 100644 > > --- a/drivers/firmware/efi/libstub/efi-stub.c > > +++ b/drivers/firmware/efi/libstub/efi-stub.c > > @@ -40,6 +40,8 @@ > > > > #ifdef CONFIG_ARM64 > > # define EFI_RT_VIRTUAL_LIMIT DEFAULT_MAP_WINDOW_64 > > +#elif defined(CONFIG_RISCV) > > +# define EFI_RT_VIRTUAL_LIMIT TASK_SIZE_MIN > > #else > > # define EFI_RT_VIRTUAL_LIMIT TASK_SIZE > > #endif > > -- > > 2.32.0 > > > > > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/
Hi Jisheng, On Sun, Dec 26, 2021 at 10:06 AM Jisheng Zhang <jszhang3@mail.ustc.edu.cn> wrote: > > On Mon, 6 Dec 2021 11:46:51 +0100 > Alexandre Ghiti <alexandre.ghiti@canonical.com> wrote: > > > By adding a new 4th level of page table, give the possibility to 64bit > > kernel to address 2^48 bytes of virtual address: in practice, that offers > > 128TB of virtual address space to userspace and allows up to 64TB of > > physical memory. > > > > If the underlying hardware does not support sv48, we will automatically > > fallback to a standard 3-level page table by folding the new PUD level into > > PGDIR level. In order to detect HW capabilities at runtime, we > > use SATP feature that ignores writes with an unsupported mode. > > > > Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com> > > --- > > arch/riscv/Kconfig | 4 +- > > arch/riscv/include/asm/csr.h | 3 +- > > arch/riscv/include/asm/fixmap.h | 1 + > > arch/riscv/include/asm/kasan.h | 6 +- > > arch/riscv/include/asm/page.h | 14 ++ > > arch/riscv/include/asm/pgalloc.h | 40 +++++ > > arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++- > > arch/riscv/include/asm/pgtable.h | 24 ++- > > arch/riscv/kernel/head.S | 3 +- > > arch/riscv/mm/context.c | 4 +- > > arch/riscv/mm/init.c | 212 +++++++++++++++++++++--- > > arch/riscv/mm/kasan_init.c | 137 ++++++++++++++- > > drivers/firmware/efi/libstub/efi-stub.c | 2 + > > 13 files changed, 514 insertions(+), 44 deletions(-) > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > > index ac6c0cd9bc29..d28fe0148e13 100644 > > --- a/arch/riscv/Kconfig > > +++ b/arch/riscv/Kconfig > > @@ -150,7 +150,7 @@ config PAGE_OFFSET > > hex > > default 0xC0000000 if 32BIT > > default 0x80000000 if 64BIT && !MMU > > - default 0xffffffd800000000 if 64BIT > > + default 0xffffaf8000000000 if 64BIT > > > > config KASAN_SHADOW_OFFSET > > hex > > @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM > > > > config PGTABLE_LEVELS > > int > > - default 3 if 64BIT > > + default 4 if 64BIT > > 
default 2 > > > > config LOCKDEP_SUPPORT > > diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h > > index 87ac65696871..3fdb971c7896 100644 > > --- a/arch/riscv/include/asm/csr.h > > +++ b/arch/riscv/include/asm/csr.h > > @@ -40,14 +40,13 @@ > > #ifndef CONFIG_64BIT > > #define SATP_PPN _AC(0x003FFFFF, UL) > > #define SATP_MODE_32 _AC(0x80000000, UL) > > -#define SATP_MODE SATP_MODE_32 > > #define SATP_ASID_BITS 9 > > #define SATP_ASID_SHIFT 22 > > #define SATP_ASID_MASK _AC(0x1FF, UL) > > #else > > #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) > > #define SATP_MODE_39 _AC(0x8000000000000000, UL) > > -#define SATP_MODE SATP_MODE_39 > > +#define SATP_MODE_48 _AC(0x9000000000000000, UL) > > #define SATP_ASID_BITS 16 > > #define SATP_ASID_SHIFT 44 > > #define SATP_ASID_MASK _AC(0xFFFF, UL) > > diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h > > index 54cbf07fb4e9..58a718573ad6 100644 > > --- a/arch/riscv/include/asm/fixmap.h > > +++ b/arch/riscv/include/asm/fixmap.h > > @@ -24,6 +24,7 @@ enum fixed_addresses { > > FIX_HOLE, > > FIX_PTE, > > FIX_PMD, > > + FIX_PUD, > > FIX_TEXT_POKE1, > > FIX_TEXT_POKE0, > > FIX_EARLYCON_MEM_BASE, > > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h > > index 743e6ff57996..0b85e363e778 100644 > > --- a/arch/riscv/include/asm/kasan.h > > +++ b/arch/riscv/include/asm/kasan.h > > @@ -28,7 +28,11 @@ > > #define KASAN_SHADOW_SCALE_SHIFT 3 > > > > #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) > > -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) > > +/* > > + * Depending on the size of the virtual address space, the region may not be > > + * aligned on PGDIR_SIZE, so force its alignment to ease its population. 
> > + */ > > +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) > > #define KASAN_SHADOW_END MODULES_LOWEST_VADDR > > #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) > > > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > > index e03559f9b35e..d089fe46f7d8 100644 > > --- a/arch/riscv/include/asm/page.h > > +++ b/arch/riscv/include/asm/page.h > > @@ -31,7 +31,20 @@ > > * When not using MMU this corresponds to the first free page in > > * physical memory (aligned on a page boundary). > > */ > > +#ifdef CONFIG_64BIT > > +#ifdef CONFIG_MMU > > +#define PAGE_OFFSET kernel_map.page_offset > > +#else > > +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > > +#endif > > +/* > > + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so > > + * define the PAGE_OFFSET value for SV39. > > + */ > > +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) > > +#else > > #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) > > +#endif /* CONFIG_64BIT */ > > > > /* > > * Half of the kernel address space (half of the entries of the page global > > @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base; > > #endif /* CONFIG_MMU */ > > > > struct kernel_mapping { > > + unsigned long page_offset; > > unsigned long virt_addr; > > uintptr_t phys_addr; > > uintptr_t size; > > diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h > > index 0af6933a7100..11823004b87a 100644 > > --- a/arch/riscv/include/asm/pgalloc.h > > +++ b/arch/riscv/include/asm/pgalloc.h > > @@ -11,6 +11,8 @@ > > #include <asm/tlb.h> > > > > #ifdef CONFIG_MMU > > +#define __HAVE_ARCH_PUD_ALLOC_ONE > > +#define __HAVE_ARCH_PUD_FREE > > #include <asm-generic/pgalloc.h> > > > > static inline void pmd_populate_kernel(struct mm_struct *mm, > > @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) > > > > set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > > } > 
> + > > +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) > > +{ > > + if (pgtable_l4_enabled) { > > + unsigned long pfn = virt_to_pfn(pud); > > + > > + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > > + } > > +} > > + > > +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, > > + pud_t *pud) > > +{ > > + if (pgtable_l4_enabled) { > > + unsigned long pfn = virt_to_pfn(pud); > > + > > + set_p4d_safe(p4d, > > + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); > > + } > > +} > > + > > +#define pud_alloc_one pud_alloc_one > > +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) > > +{ > > + if (pgtable_l4_enabled) > > + return __pud_alloc_one(mm, addr); > > + > > + return NULL; > > +} > > + > > +#define pud_free pud_free > > +static inline void pud_free(struct mm_struct *mm, pud_t *pud) > > +{ > > + if (pgtable_l4_enabled) > > + __pud_free(mm, pud); > > +} > > + > > +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) > > #endif /* __PAGETABLE_PMD_FOLDED */ > > > > static inline pgd_t *pgd_alloc(struct mm_struct *mm) > > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h > > index 228261aa9628..bbbdd66e5e2f 100644 > > --- a/arch/riscv/include/asm/pgtable-64.h > > +++ b/arch/riscv/include/asm/pgtable-64.h > > @@ -8,16 +8,36 @@ > > > > #include <linux/const.h> > > > > -#define PGDIR_SHIFT 30 > > +extern bool pgtable_l4_enabled; > > + > > +#define PGDIR_SHIFT_L3 30 > > +#define PGDIR_SHIFT_L4 39 > > +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) > > + > > +#define PGDIR_SHIFT (pgtable_l4_enabled ? 
PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) > > /* Size of region mapped by a page global directory */ > > #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) > > #define PGDIR_MASK (~(PGDIR_SIZE - 1)) > > > > +/* pud is folded into pgd in case of 3-level page table */ > > +#define PUD_SHIFT 30 > > +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) > > +#define PUD_MASK (~(PUD_SIZE - 1)) > > + > > #define PMD_SHIFT 21 > > /* Size of region mapped by a page middle directory */ > > #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) > > #define PMD_MASK (~(PMD_SIZE - 1)) > > > > +/* Page Upper Directory entry */ > > +typedef struct { > > + unsigned long pud; > > +} pud_t; > > + > > +#define pud_val(x) ((x).pud) > > +#define __pud(x) ((pud_t) { (x) }) > > +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) > > + > > /* Page Middle Directory entry */ > > typedef struct { > > unsigned long pmd; > > @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) > > set_pud(pudp, __pud(0)); > > } > > > > +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) > > +{ > > + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > > +} > > + > > +static inline unsigned long _pud_pfn(pud_t pud) > > +{ > > + return pud_val(pud) >> _PAGE_PFN_SHIFT; > > +} > > + > > static inline pmd_t *pud_pgtable(pud_t pud) > > { > > return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); > > @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) > > return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); > > } > > > > +#define mm_pud_folded mm_pud_folded > > +static inline bool mm_pud_folded(struct mm_struct *mm) > > +{ > > + if (pgtable_l4_enabled) > > + return false; > > + > > + return true; > > +} > > + > > +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) > > + > > static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) > > { > > return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); > > @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) > > #define 
pmd_ERROR(e) \ > > pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) > > > > +#define pud_ERROR(e) \ > > + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) > > + > > +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + *p4dp = p4d; > > + else > > + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); > > +} > > + > > +static inline int p4d_none(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return (p4d_val(p4d) == 0); > > + > > + return 0; > > +} > > + > > +static inline int p4d_present(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return (p4d_val(p4d) & _PAGE_PRESENT); > > + > > + return 1; > > +} > > + > > +static inline int p4d_bad(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return !p4d_present(p4d); > > + > > + return 0; > > +} > > + > > +static inline void p4d_clear(p4d_t *p4d) > > +{ > > + if (pgtable_l4_enabled) > > + set_p4d(p4d, __p4d(0)); > > +} > > + > > +static inline pud_t *p4d_pgtable(p4d_t p4d) > > +{ > > + if (pgtable_l4_enabled) > > + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > > + > > + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); > > +} > > + > > +static inline struct page *p4d_page(p4d_t p4d) > > +{ > > + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); > > +} > > + > > +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) > > + > > +#define pud_offset pud_offset > > +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) > > +{ > > + if (pgtable_l4_enabled) > > + return p4d_pgtable(*p4d) + pud_index(address); > > + > > + return (pud_t *)p4d; > > +} > > + > > #endif /* _ASM_RISCV_PGTABLE_64_H */ > > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > > index e1a52e22ad7e..e1c74ef4ead2 100644 > > --- a/arch/riscv/include/asm/pgtable.h > > +++ b/arch/riscv/include/asm/pgtable.h > > @@ -51,7 +51,7 @@ > > * position vmemmap directly below the VMALLOC region. 
> > */ > > #ifdef CONFIG_64BIT > > -#define VA_BITS 39 > > +#define VA_BITS (pgtable_l4_enabled ? 48 : 39) > > #else > > #define VA_BITS 32 > > #endif > > @@ -90,8 +90,7 @@ > > > > #ifndef __ASSEMBLY__ > > > > -/* Page Upper Directory not used in RISC-V */ > > -#include <asm-generic/pgtable-nopud.h> > > +#include <asm-generic/pgtable-nop4d.h> > > #include <asm/page.h> > > #include <asm/tlbflush.h> > > #include <linux/mm_types.h> > > @@ -113,6 +112,17 @@ > > #define XIP_FIXUP(addr) (addr) > > #endif /* CONFIG_XIP_KERNEL */ > > > > +struct pt_alloc_ops { > > + pte_t *(*get_pte_virt)(phys_addr_t pa); > > + phys_addr_t (*alloc_pte)(uintptr_t va); > > +#ifndef __PAGETABLE_PMD_FOLDED > > + pmd_t *(*get_pmd_virt)(phys_addr_t pa); > > + phys_addr_t (*alloc_pmd)(uintptr_t va); > > + pud_t *(*get_pud_virt)(phys_addr_t pa); > > + phys_addr_t (*alloc_pud)(uintptr_t va); > > +#endif > > +}; > > + > > #ifdef CONFIG_MMU > > /* Number of entries in the page global directory */ > > #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) > > @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, > > * Note that PGDIR_SIZE must evenly divide TASK_SIZE. 
> > */ > > #ifdef CONFIG_64BIT > > -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > > +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) > > +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) > > #else > > -#define TASK_SIZE FIXADDR_START > > +#define TASK_SIZE FIXADDR_START > > +#define TASK_SIZE_MIN TASK_SIZE > > #endif > > > > #else /* CONFIG_MMU */ > > @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa; > > #define dtb_early_va _dtb_early_va > > #define dtb_early_pa _dtb_early_pa > > #endif /* CONFIG_XIP_KERNEL */ > > +extern u64 satp_mode; > > +extern bool pgtable_l4_enabled; > > > > void paging_init(void); > > void misc_mem_init(void); > > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > > index 52c5ff9804c5..c3c0ed559770 100644 > > --- a/arch/riscv/kernel/head.S > > +++ b/arch/riscv/kernel/head.S > > @@ -95,7 +95,8 @@ relocate: > > > > /* Compute satp for kernel page tables, but don't load it yet */ > > srl a2, a0, PAGE_SHIFT > > - li a1, SATP_MODE > > + la a1, satp_mode > > + REG_L a1, 0(a1) > > or a2, a2, a1 > > > > /* > > diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c > > index ee3459cb6750..a7246872bd30 100644 > > --- a/arch/riscv/mm/context.c > > +++ b/arch/riscv/mm/context.c > > @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > > switch_mm_fast: > > csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | > > ((cntx & asid_mask) << SATP_ASID_SHIFT) | > > - SATP_MODE); > > + satp_mode); > > > > if (need_flush_tlb) > > local_flush_tlb_all(); > > @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) > > static void set_mm_noasid(struct mm_struct *mm) > > { > > /* Switch the page table and blindly nuke entire local TLB */ > > - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); > > + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); > > local_flush_tlb_all(); > > } > > > > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > > index 
1552226fb6bd..6a19a1b1caf8 100644 > > --- a/arch/riscv/mm/init.c > > +++ b/arch/riscv/mm/init.c > > @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map); > > #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) > > #endif > > > > +#ifdef CONFIG_64BIT > > +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; > > +#else > > +u64 satp_mode = SATP_MODE_32; > > +#endif > > +EXPORT_SYMBOL(satp_mode); > > + > > +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? > > + true : false; > > Hi Alex, > > I'm not sure whether we can use static key for pgtable_l4_enabled or > not. Obviously, for a specific HW platform, pgtable_l4_enabled won't change > after boot, and it seems it sits hot code path, so IMHO, static key maybe > suitable for it. Thanks for the suggestion, I'll explore that after this series is merged if you don't mind. Thanks, Alex > > Thanks >
Hello Alex, On 12/6/21 12:46, Alexandre Ghiti wrote: > > +#ifdef CONFIG_64BIT > +static void __init disable_pgtable_l4(void) > +{ > + pgtable_l4_enabled = false; > + kernel_map.page_offset = PAGE_OFFSET_L3; > + satp_mode = SATP_MODE_39; > +} > + > +/* > + * There is a simple way to determine if 4-level is supported by the > + * underlying hardware: establish 1:1 mapping in 4-level page table mode > + * then read SATP to see if the configuration was taken into account > + * meaning sv48 is supported. > + */ > +static __init void set_satp_mode(void) > +{ > + u64 identity_satp, hw_satp; > + uintptr_t set_satp_mode_pmd; > + > + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; > + create_pgd_mapping(early_pg_dir, > + set_satp_mode_pmd, (uintptr_t)early_pud, > + PGDIR_SIZE, PAGE_TABLE); > + create_pud_mapping(early_pud, > + set_satp_mode_pmd, (uintptr_t)early_pmd, > + PUD_SIZE, PAGE_TABLE); > + /* Handle the case where set_satp_mode straddles 2 PMDs */ > + create_pmd_mapping(early_pmd, > + set_satp_mode_pmd, set_satp_mode_pmd, > + PMD_SIZE, PAGE_KERNEL_EXEC); > + create_pmd_mapping(early_pmd, > + set_satp_mode_pmd + PMD_SIZE, > + set_satp_mode_pmd + PMD_SIZE, > + PMD_SIZE, PAGE_KERNEL_EXEC); > + > + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; > + > + local_flush_tlb_all(); > + csr_write(CSR_SATP, identity_satp); > + hw_satp = csr_swap(CSR_SATP, 0ULL); > + local_flush_tlb_all(); > + > + if (hw_satp != identity_satp) > + disable_pgtable_l4(); > + > + memset(early_pg_dir, 0, PAGE_SIZE); > + memset(early_pud, 0, PAGE_SIZE); > + memset(early_pmd, 0, PAGE_SIZE); > +} > +#endif > + When doing the 1:1 mapping you don't take into account the limitation that all bits above 47 need to have the same value as bit 47. If the kernel exists at a high physical address with bit 47 set, the corresponding virtual address will be invalid, resulting in an instruction fetch fault as the privileged spec mandates. We verified this bug on our prototype.
I suggest we rewrite this in assembly and do a proper satp switch like we do in head.S, so that we don't need the 1:1 mapping and we also have a way to recover in case this fails. Regards, Nick
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index ac6c0cd9bc29..d28fe0148e13 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -150,7 +150,7 @@ config PAGE_OFFSET hex default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffd800000000 if 64BIT + default 0xffffaf8000000000 if 64BIT config KASAN_SHADOW_OFFSET hex @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int - default 3 if 64BIT + default 4 if 64BIT default 2 config LOCKDEP_SUPPORT diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 87ac65696871..3fdb971c7896 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -40,14 +40,13 @@ #ifndef CONFIG_64BIT #define SATP_PPN _AC(0x003FFFFF, UL) #define SATP_MODE_32 _AC(0x80000000, UL) -#define SATP_MODE SATP_MODE_32 #define SATP_ASID_BITS 9 #define SATP_ASID_SHIFT 22 #define SATP_ASID_MASK _AC(0x1FF, UL) #else #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) #define SATP_MODE_39 _AC(0x8000000000000000, UL) -#define SATP_MODE SATP_MODE_39 +#define SATP_MODE_48 _AC(0x9000000000000000, UL) #define SATP_ASID_BITS 16 #define SATP_ASID_SHIFT 44 #define SATP_ASID_MASK _AC(0xFFFF, UL) diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 54cbf07fb4e9..58a718573ad6 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -24,6 +24,7 @@ enum fixed_addresses { FIX_HOLE, FIX_PTE, FIX_PMD, + FIX_PUD, FIX_TEXT_POKE1, FIX_TEXT_POKE0, FIX_EARLYCON_MEM_BASE, diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 743e6ff57996..0b85e363e778 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -28,7 +28,11 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) +/* + * Depending on the size of the virtual address space, the region may 
not be + * aligned on PGDIR_SIZE, so force its alignment to ease its population. + */ +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index e03559f9b35e..d089fe46f7d8 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -31,7 +31,20 @@ * When not using MMU this corresponds to the first free page in * physical memory (aligned on a page boundary). */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_MMU +#define PAGE_OFFSET kernel_map.page_offset +#else +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#endif +/* + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so + * define the PAGE_OFFSET value for SV39. + */ +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) +#else #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#endif /* CONFIG_64BIT */ /* * Half of the kernel address space (half of the entries of the page global @@ -90,6 +103,7 @@ extern unsigned long riscv_pfn_base; #endif /* CONFIG_MMU */ struct kernel_mapping { + unsigned long page_offset; unsigned long virt_addr; uintptr_t phys_addr; uintptr_t size; diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 0af6933a7100..11823004b87a 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -11,6 +11,8 @@ #include <asm/tlb.h> #ifdef CONFIG_MMU +#define __HAVE_ARCH_PUD_ALLOC_ONE +#define __HAVE_ARCH_PUD_FREE #include <asm-generic/pgalloc.h> static inline void pmd_populate_kernel(struct mm_struct *mm, @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); } + +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = 
virt_to_pfn(pud); + + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, + pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = virt_to_pfn(pud); + + set_p4d_safe(p4d, + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +#define pud_alloc_one pud_alloc_one +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + if (pgtable_l4_enabled) + return __pud_alloc_one(mm, addr); + + return NULL; +} + +#define pud_free pud_free +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + if (pgtable_l4_enabled) + __pud_free(mm, pud); +} + +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) #endif /* __PAGETABLE_PMD_FOLDED */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 228261aa9628..bbbdd66e5e2f 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -8,16 +8,36 @@ #include <linux/const.h> -#define PGDIR_SHIFT 30 +extern bool pgtable_l4_enabled; + +#define PGDIR_SHIFT_L3 30 +#define PGDIR_SHIFT_L4 39 +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) + +#define PGDIR_SHIFT (pgtable_l4_enabled ? 
PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) /* Size of region mapped by a page global directory */ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +/* pud is folded into pgd in case of 3-level page table */ +#define PUD_SHIFT 30 +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) + #define PMD_SHIFT 21 /* Size of region mapped by a page middle directory */ #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE - 1)) +/* Page Upper Directory entry */ +typedef struct { + unsigned long pud; +} pud_t; + +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) }) +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) + /* Page Middle Directory entry */ typedef struct { unsigned long pmd; @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) +{ + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); +} + +static inline unsigned long _pud_pfn(pud_t pud) +{ + return pud_val(pud) >> _PAGE_PFN_SHIFT; +} + static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); } +#define mm_pud_folded mm_pud_folded +static inline bool mm_pud_folded(struct mm_struct *mm) +{ + if (pgtable_l4_enabled) + return false; + + return true; +} + +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) + static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) { return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + if 
(pgtable_l4_enabled) + *p4dp = p4d; + else + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); +} + +static inline int p4d_none(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) == 0); + + return 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) & _PAGE_PRESENT); + + return 1; +} + +static inline int p4d_bad(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return !p4d_present(p4d); + + return 0; +} + +static inline void p4d_clear(p4d_t *p4d) +{ + if (pgtable_l4_enabled) + set_p4d(p4d, __p4d(0)); +} + +static inline pud_t *p4d_pgtable(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); + + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); +} + +static inline struct page *p4d_page(p4d_t p4d) +{ + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); +} + +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) + +#define pud_offset pud_offset +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + if (pgtable_l4_enabled) + return p4d_pgtable(*p4d) + pud_index(address); + + return (pud_t *)p4d; +} + #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index e1a52e22ad7e..e1c74ef4ead2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -51,7 +51,7 @@ * position vmemmap directly below the VMALLOC region. */ #ifdef CONFIG_64BIT -#define VA_BITS 39 +#define VA_BITS (pgtable_l4_enabled ? 
48 : 39) #else #define VA_BITS 32 #endif @@ -90,8 +90,7 @@ #ifndef __ASSEMBLY__ -/* Page Upper Directory not used in RISC-V */ -#include <asm-generic/pgtable-nopud.h> +#include <asm-generic/pgtable-nop4d.h> #include <asm/page.h> #include <asm/tlbflush.h> #include <linux/mm_types.h> @@ -113,6 +112,17 @@ #define XIP_FIXUP(addr) (addr) #endif /* CONFIG_XIP_KERNEL */ +struct pt_alloc_ops { + pte_t *(*get_pte_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pte)(uintptr_t va); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t *(*get_pmd_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pmd)(uintptr_t va); + pud_t *(*get_pud_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pud)(uintptr_t va); +#endif +}; + #ifdef CONFIG_MMU /* Number of entries in the page global directory */ #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) @@ -669,9 +679,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * Note that PGDIR_SIZE must evenly divide TASK_SIZE. */ #ifdef CONFIG_64BIT -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) #else -#define TASK_SIZE FIXADDR_START +#define TASK_SIZE FIXADDR_START +#define TASK_SIZE_MIN TASK_SIZE #endif #else /* CONFIG_MMU */ @@ -697,6 +709,8 @@ extern uintptr_t _dtb_early_pa; #define dtb_early_va _dtb_early_va #define dtb_early_pa _dtb_early_pa #endif /* CONFIG_XIP_KERNEL */ +extern u64 satp_mode; +extern bool pgtable_l4_enabled; void paging_init(void); void misc_mem_init(void); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 52c5ff9804c5..c3c0ed559770 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -95,7 +95,8 @@ relocate: /* Compute satp for kernel page tables, but don't load it yet */ srl a2, a0, PAGE_SHIFT - li a1, SATP_MODE + la a1, satp_mode + REG_L a1, 0(a1) or a2, a2, a1 /* diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index ee3459cb6750..a7246872bd30 100644 --- 
a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) switch_mm_fast: csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | ((cntx & asid_mask) << SATP_ASID_SHIFT) | - SATP_MODE); + satp_mode); if (need_flush_tlb) local_flush_tlb_all(); @@ -201,7 +201,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) static void set_mm_noasid(struct mm_struct *mm) { /* Switch the page table and blindly nuke entire local TLB */ - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); local_flush_tlb_all(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 1552226fb6bd..6a19a1b1caf8 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map); #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) #endif +#ifdef CONFIG_64BIT +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; +#else +u64 satp_mode = SATP_MODE_32; +#endif +EXPORT_SYMBOL(satp_mode); + +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? 
+ true : false; +EXPORT_SYMBOL(pgtable_l4_enabled); + phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); @@ -53,15 +64,6 @@ extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -struct pt_alloc_ops { - pte_t *(*get_pte_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pte)(uintptr_t va); -#ifndef __PAGETABLE_PMD_FOLDED - pmd_t *(*get_pmd_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pmd)(uintptr_t va); -#endif -}; - static phys_addr_t dma32_phys_limit __initdata; static void __init zone_sizes_init(void) @@ -222,7 +224,7 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -static struct pt_alloc_ops _pt_ops __initdata; +struct pt_alloc_ops _pt_ops __initdata; #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) #endif /* CONFIG_XIP_KERNEL */ +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) +#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud)) +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) +#endif /* CONFIG_XIP_KERNEL */ + static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { /* Before MMU is enabled */ @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) static phys_addr_t 
__init alloc_pmd_early(uintptr_t va) { - BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); return (uintptr_t)early_pmd; } @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, create_pte_mapping(ptep, va, pa, sz, prot); } -#define pgd_next_t pmd_t -#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) +static pud_t *__init get_pud_virt_early(phys_addr_t pa) +{ + return (pud_t *)((uintptr_t)pa); +} + +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PUD); + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); +} + +static pud_t *__init get_pud_virt_late(phys_addr_t pa) +{ + return (pud_t *)__va(pa); +} + +static phys_addr_t __init alloc_pud_early(uintptr_t va) +{ + /* Only one PUD is available for early mapping */ + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + + return (uintptr_t)early_pud; +} + +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_pud_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + +static void __init create_pud_mapping(pud_t *pudp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pmd_t *nextp; + phys_addr_t next_phys; + uintptr_t pud_index = pud_index(va); + + if (sz == PUD_SIZE) { + if (pud_val(pudp[pud_index]) == 0) + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); + return; + } + + if (pud_val(pudp[pud_index]) == 0) { + next_phys = pt_ops.alloc_pmd(va); + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); + nextp = pt_ops.get_pmd_virt(next_phys); + memset(nextp, 0, PAGE_SIZE); + } else { + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); + nextp = pt_ops.get_pmd_virt(next_phys); + } + + create_pmd_mapping(nextp, va, pa, sz, prot); +} + +#define pgd_next_t pud_t +#define 
alloc_pgd_next(__va) (pgtable_l4_enabled ? \ + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) +#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \ + pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pmd + (pgtable_l4_enabled ? \ + create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \ + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) +#define fixmap_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) +#define trampoline_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) +#define early_dtb_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pte +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) +#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) -#endif +#endif /* __PAGETABLE_PMD_FOLDED */ void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, @@ -493,6 +582,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) } #endif /* CONFIG_STRICT_KERNEL_RWX */ +#ifdef CONFIG_64BIT +static void __init disable_pgtable_l4(void) +{ + pgtable_l4_enabled = false; + kernel_map.page_offset = PAGE_OFFSET_L3; + satp_mode = SATP_MODE_39; +} + +/* + * There is a simple way to determine if 4-level is supported by the + * underlying hardware: establish 1:1 mapping in 4-level page table mode + * then read SATP to see if the configuration was taken into 
account + * meaning sv48 is supported. + */ +static __init void set_satp_mode(void) +{ + u64 identity_satp, hw_satp; + uintptr_t set_satp_mode_pmd; + + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; + create_pgd_mapping(early_pg_dir, + set_satp_mode_pmd, (uintptr_t)early_pud, + PGDIR_SIZE, PAGE_TABLE); + create_pud_mapping(early_pud, + set_satp_mode_pmd, (uintptr_t)early_pmd, + PUD_SIZE, PAGE_TABLE); + /* Handle the case where set_satp_mode straddles 2 PMDs */ + create_pmd_mapping(early_pmd, + set_satp_mode_pmd, set_satp_mode_pmd, + PMD_SIZE, PAGE_KERNEL_EXEC); + create_pmd_mapping(early_pmd, + set_satp_mode_pmd + PMD_SIZE, + set_satp_mode_pmd + PMD_SIZE, + PMD_SIZE, PAGE_KERNEL_EXEC); + + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; + + local_flush_tlb_all(); + csr_write(CSR_SATP, identity_satp); + hw_satp = csr_swap(CSR_SATP, 0ULL); + local_flush_tlb_all(); + + if (hw_satp != identity_satp) + disable_pgtable_l4(); + + memset(early_pg_dir, 0, PAGE_SIZE); + memset(early_pud, 0, PAGE_SIZE); + memset(early_pmd, 0, PAGE_SIZE); +} +#endif + /* * setup_vm() is called from head.S with MMU-off. * @@ -557,10 +697,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, + IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, PGDIR_SIZE, IS_ENABLED(CONFIG_64BIT) ? 
PAGE_TABLE : PAGE_KERNEL); + if (pgtable_l4_enabled) { + create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, + (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); + } + if (IS_ENABLED(CONFIG_64BIT)) { create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, pa, PMD_SIZE, PAGE_KERNEL); @@ -593,6 +738,8 @@ void pt_ops_set_early(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = alloc_pmd_early; pt_ops.get_pmd_virt = get_pmd_virt_early; + pt_ops.alloc_pud = alloc_pud_early; + pt_ops.get_pud_virt = get_pud_virt_early; #endif } @@ -611,6 +758,8 @@ void pt_ops_set_fixmap(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); + pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); + pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); #endif } @@ -625,6 +774,8 @@ void pt_ops_set_late(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = alloc_pmd_late; pt_ops.get_pmd_virt = get_pmd_virt_late; + pt_ops.alloc_pud = alloc_pud_late; + pt_ops.get_pud_virt = get_pud_virt_late; #endif } @@ -633,6 +784,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; kernel_map.virt_addr = KERNEL_LINK_ADDR; + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); #ifdef CONFIG_XIP_KERNEL kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; @@ -647,6 +799,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.phys_addr = (uintptr_t)(&_start); kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; #endif + +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) + set_satp_mode(); +#endif + kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; @@ -676,15 +833,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) /* Setup early PGD for fixmap */ 
create_pgd_mapping(early_pg_dir, FIXADDR_START, - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); #ifndef __PAGETABLE_PMD_FOLDED - /* Setup fixmap PMD */ + /* Setup fixmap PUD and PMD */ + if (pgtable_l4_enabled) + create_pud_mapping(fixmap_pud, FIXADDR_START, + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); create_pmd_mapping(fixmap_pmd, FIXADDR_START, (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); /* Setup trampoline PGD and PMD */ create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); + if (pgtable_l4_enabled) + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); #ifdef CONFIG_XIP_KERNEL create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); @@ -712,7 +875,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap * range can not span multiple pmds. 
*/ - BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); #ifndef __PAGETABLE_PMD_FOLDED @@ -783,9 +946,10 @@ static void __init setup_vm_final(void) /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); + clear_fixmap(FIX_PUD); /* Move to swapper page table */ - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); local_flush_tlb_all(); pt_ops_set_late(); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 1434a0225140..993f50571a3b 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -11,7 +11,29 @@ #include <asm/fixmap.h> #include <asm/pgalloc.h> +/* + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 + * which is right before the kernel. + * + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate + * the page global directory with kasan_early_shadow_pmd. + * + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping + * must be divided as follows: + * - the first PGD entry, although incomplete, is populated with + * kasan_early_shadow_pud/p4d + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d + * - the last PGD entry is shared with the kernel mapping so populated at the + * lower levels pud/p4d + * + * In addition, when shallow populating a kasan region (for example vmalloc), + * this region may also not be aligned on PGDIR size, so we must go down to the + * pud level too. 
+ */ + extern pgd_t early_pg_dir[PTRS_PER_PGD]; +extern struct pt_alloc_ops _pt_ops __initdata; +#define pt_ops _pt_ops static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); } -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pmd_t *pmdp, *base_pmd; unsigned long next; - base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + if (pud_none(*pud)) { base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } else { + base_pmd = (pmd_t *)pud_pgtable(*pud); + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } pmdp = base_pmd + pmd_index(vaddr); @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned * it entirely, memblock could allocate a page at a physical address * where KASAN is not populated yet and then we'd get a page fault. */ - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); +} + +static void __init kasan_populate_pud(pgd_t *pgd, + unsigned long vaddr, unsigned long end, + bool early) +{ + phys_addr_t phys_addr; + pud_t *pudp, *base_pud; + unsigned long next; + + if (early) { + /* + * We can't use pgd_page_vaddr here as it would return a linear + * mapping address but it is not mapped yet, but when populating + * early_pg_dir, we need the physical address and when populating + * swapper_pg_dir, we need the kernel virtual address so use + * pt_ops facility. 
+ */ + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); + } else { + base_pud = (pud_t *)pgd_page_vaddr(*pgd); + if (base_pud == lm_alias(kasan_early_shadow_pud)) + base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + } + + pudp = base_pud + pud_index(vaddr); + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + if (early) { + phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } else { + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); + if (phys_addr) { + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } + } + } + + kasan_populate_pmd(pudp, vaddr, next); + } while (pudp++, vaddr = next, vaddr != end); + + /* + * Wait for the whole PGD to be populated before setting the PGD in + * the page table, otherwise, if we did set the PGD before populating + * it entirely, memblock could allocate a page at a physical address + * where KASAN is not populated yet and then we'd get a page fault. + */ + if (!early) + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); } +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)kasan_early_shadow_pud : \ + (uintptr_t)kasan_early_shadow_pmd) +#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ + (pgtable_l4_enabled ? 
\ + kasan_populate_pud(pgdp, vaddr, next, early) : \ + kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) + static void __init kasan_populate_pgd(pgd_t *pgdp, unsigned long vaddr, unsigned long end, bool early) @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, } } - kasan_populate_pmd(pgdp, vaddr, next); + kasan_populate_pgd_next(pgdp, vaddr, next, early); } while (pgdp++, vaddr = next, vaddr != end); } @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end) memset(start, KASAN_SHADOW_INIT, end - start); } +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, + unsigned long vaddr, unsigned long end, + bool kasan_populate) +{ + unsigned long next; + pud_t *pudp, *base_pud; + pmd_t *base_pmd; + bool is_kasan_pmd; + + base_pud = (pud_t *)pgd_page_vaddr(*pgdp); + pudp = base_pud + pud_index(vaddr); + + if (kasan_populate) + memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, + sizeof(pud_t) * PTRS_PER_PUD); + + do { + next = pud_addr_end(vaddr, end); + is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); + + if (is_kasan_pmd) { + base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + } + } while (pudp++, vaddr = next, vaddr != end); +} + static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) { unsigned long next; void *p; pgd_t *pgd_k = pgd_offset_k(vaddr); + bool is_kasan_pgd_next; do { next = pgd_addr_end(vaddr, end); - if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { + is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); + + if (is_kasan_pgd_next) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } + + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) + continue; + + kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); } while (pgd_k++, vaddr = 
next, vaddr != end); } diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 26e69788f27a..b3db5d91ed38 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -40,6 +40,8 @@ #ifdef CONFIG_ARM64 # define EFI_RT_VIRTUAL_LIMIT DEFAULT_MAP_WINDOW_64 +#elif defined(CONFIG_RISCV) +# define EFI_RT_VIRTUAL_LIMIT TASK_SIZE_MIN #else # define EFI_RT_VIRTUAL_LIMIT TASK_SIZE #endif
By adding a new 4th level of page table, allow the 64-bit kernel to address 2^48 bytes of virtual address space: in practice, that offers 128TB of virtual address space to userspace and allows up to 64TB of physical memory. If the underlying hardware does not support sv48, we automatically fall back to a standard 3-level page table by folding the new PUD level into the PGDIR level. In order to detect HW capabilities at runtime, we rely on the SATP behavior of ignoring writes with an unsupported mode. Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com> --- arch/riscv/Kconfig | 4 +- arch/riscv/include/asm/csr.h | 3 +- arch/riscv/include/asm/fixmap.h | 1 + arch/riscv/include/asm/kasan.h | 6 +- arch/riscv/include/asm/page.h | 14 ++ arch/riscv/include/asm/pgalloc.h | 40 +++++ arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++- arch/riscv/include/asm/pgtable.h | 24 ++- arch/riscv/kernel/head.S | 3 +- arch/riscv/mm/context.c | 4 +- arch/riscv/mm/init.c | 212 +++++++++++++++++++++--- arch/riscv/mm/kasan_init.c | 137 ++++++++++++++- drivers/firmware/efi/libstub/efi-stub.c | 2 + 13 files changed, 514 insertions(+), 44 deletions(-)