Message ID | ed166c38bbcf82ad58a14353a880d1e208cb2ff1.1690582001.git.sanastasio@raptorengineering.com
---|---
State | Superseded
Series | xen/ppc: Add early Radix MMU support
On 29.07.2023 00:21, Shawn Anastasio wrote: > --- /dev/null > +++ b/xen/arch/ppc/include/asm/bitops.h > @@ -0,0 +1,11 @@ > +#ifndef _ASM_PPC_BITOPS_H > +#define _ASM_PPC_BITOPS_H > + > +#include <xen/compiler.h> Not a big deal, but ... > +/* PPC bit number conversion */ > +#define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) > +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) > +#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) > + > +#endif /* _ASM_PPC_BITOPS_H */ ... nothing here looks to require that #include. > --- /dev/null > +++ b/xen/arch/ppc/include/asm/mm.h > @@ -0,0 +1,19 @@ > +#ifndef _ASM_PPC_MM_H > +#define _ASM_PPC_MM_H > + > +#include <asm/config.h> This is included by xen/config.h, which in turn is included from the compiler command line. > --- /dev/null > +++ b/xen/arch/ppc/include/asm/page.h > @@ -0,0 +1,178 @@ > +#ifndef _ASM_PPC_PAGE_H > +#define _ASM_PPC_PAGE_H > + > +#include <xen/types.h> > + > +#include <asm/bitops.h> > +#include <asm/byteorder.h> > + > +#define PDE_VALID PPC_BIT(0) > +#define PDE_NLB_MASK 0xfffffffffffffUL > +#define PDE_NLB_SHIFT 5UL > +#define PDE_NLS_MASK 0x1f > + > +#define PTE_VALID PPC_BIT(0) > +#define PTE_LEAF PPC_BIT(1) > +#define PTE_REFERENCE PPC_BIT(55) > +#define PTE_CHANGE PPC_BIT(56) > + > +/* PTE Attributes */ > +#define PTE_ATT_SAO PPC_BIT(59) /* Strong Access Ordering */ > +#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58) > +#define PTE_ATT_TOLERANT (PPC_BIT(58) | PPC_BIT(59)) > + > +/* PTE Encoded Access Authority*/ > +#define PTE_EAA_PRIVILEGED PPC_BIT(60) > +#define PTE_EAA_READ PPC_BIT(61) > +#define PTE_EAA_WRITE PPC_BIT(62) > +#define PTE_EAA_EXECUTE PPC_BIT(63) > + > +/* Field shifts/masks */ > +#define PTE_RPN_MASK 0x1fffffffffffUL > +#define PTE_RPN_SHIFT 12UL > +#define PTE_ATT_MASK 0x3UL > +#define PTE_ATT_SHIFT 4UL > +#define PTE_EAA_MASK 0xfUL Did you consider introducing just *_MASK values, utilizing MASK_INSR() and MASK_EXTR() instead of open-coded shifting/masking? 
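For reference, PPC_BIT() above follows IBM (MSB-0) bit numbering: bit 0 is the most significant bit of the long. A quick illustration of the quoted macros, assuming BITS_PER_LONG == 64:

    PPC_BIT(0)        == 1UL << 63                /* MSB */
    PPC_BIT(63)       == 1UL << 0                 /* LSB */
    PPC_BITMASK(0, 3) == 0xF000000000000000UL     /* IBM bits 0..3 */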
> +#define PTE_XEN_BASE (PTE_VALID | PTE_EAA_PRIVILEGED | PTE_REFERENCE) > +#define PTE_XEN_RW (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_WRITE | PTE_CHANGE) > +#define PTE_XEN_RO (PTE_XEN_BASE | PTE_EAA_READ) > +#define PTE_XEN_RX (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_EXECUTE) > + > +/* > + * Radix Tree layout for 64KB pages: > + * > + * [ L1 (ROOT) PAGE DIRECTORY (8192 * sizeof(pde_t)) ] > + * | > + * | > + * v > + * [ L2 PAGE DIRECTORY (512 * sizeof(pde_t)) ] > + * | > + * | > + * v > + * [ L3 PAGE DIRECTORY (512 * sizeof(pde_t)) ] > + * | > + * | > + * v > + * [ L4 PAGE TABLE (32 * sizeof(pte_t)) ] > + * | > + * | > + * v > + * [ PAGE TABLE ENTRY ] > + */ > + > +#define XEN_PT_ENTRIES_LOG2_LVL_1 13 /* 2**13 entries, maps 2**13 * 512GB = 4PB */ > +#define XEN_PT_ENTRIES_LOG2_LVL_2 9 /* 2**9 entries, maps 2**9 * 1GB = 512GB */ > +#define XEN_PT_ENTRIES_LOG2_LVL_3 9 /* 2**9 entries, maps 2**9 * 1GB = 512GB */ > +#define XEN_PT_ENTRIES_LOG2_LVL_4 5 /* 2**5 entries, maps 2**5 * 64K = 2MB */ > + > +#define XEN_PT_SHIFT_LVL_1 (XEN_PT_SHIFT_LVL_2 + XEN_PT_ENTRIES_LOG2_LVL_2) > +#define XEN_PT_SHIFT_LVL_2 (XEN_PT_SHIFT_LVL_3 + XEN_PT_ENTRIES_LOG2_LVL_3) > +#define XEN_PT_SHIFT_LVL_3 (XEN_PT_SHIFT_LVL_4 + XEN_PT_ENTRIES_LOG2_LVL_4) > +#define XEN_PT_SHIFT_LVL_4 PAGE_SHIFT > + > +#define XEN_PT_ENTRIES_LOG2_LVL(lvl) (XEN_PT_ENTRIES_LOG2_LVL_##lvl) > +#define XEN_PT_SHIFT_LVL(lvl) (XEN_PT_SHIFT_LVL_##lvl) > +#define XEN_PT_ENTRIES_LVL(lvl) (1UL << XEN_PT_ENTRIES_LOG2_LVL(lvl)) > +#define XEN_PT_MASK_LVL(lvl) (XEN_PT_ENTRIES_LVL(lvl) - 1) > +#define XEN_PT_INDEX_LVL(lvl, va) ((va >> XEN_PT_SHIFT_LVL(lvl)) & XEN_PT_MASK_LVL(lvl)) > + > +/* > + * Calculate the index of the provided virtual address in the provided > + * page table struct > + */ > +#define pt_index(pt, va) _Generic((pt), \ Earlier on I did ask about the minimum compiler version you require for building the PPC hypervisor. Iirc you said about any, but here you're using another feature where I'm not sure how far back it is available. As indicated back then, it would be nice if ./README could gain respective information. > + struct lvl1_pd * : XEN_PT_INDEX_LVL(1, (va)), \ > + struct lvl2_pd * : XEN_PT_INDEX_LVL(2, (va)), \ > + struct lvl3_pd * : XEN_PT_INDEX_LVL(3, (va)), \ > + struct lvl4_pt * : XEN_PT_INDEX_LVL(4, (va))) > + > +#define pt_entry(pt, va) (&((pt)->entries[pt_index((pt), (va))])) As to style: Here (and elsewhere) you put unnecessary parentheses around two of the three uses of the macro arguments. If you think you want to keep it like that, feel free; in general we're recommending to omit ones not really needed, to help readability. Otoh a little higher up in XEN_PT_INDEX_LVL() parentheses are missing around va. 
> +typedef struct > +{ > + __be64 pde; > +} pde_t; > + > +typedef struct > +{ > + __be64 pte; > +} pte_t; > + > +struct lvl1_pd > +{ > + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1]; > +}; > + > +struct lvl2_pd > +{ > + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_2]; > +}; > + > +struct lvl3_pd > +{ > + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_3]; > +}; > + > +struct lvl4_pt > +{ > + pte_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_4]; > +}; > + > +static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags) > +{ > + return (pte_t){ .pte = cpu_to_be64(paddr | flags | PTE_LEAF) }; > +} > + > +static inline pde_t paddr_to_pde(paddr_t paddr, unsigned long flags, unsigned long nls) > +{ > + return (pde_t){ .pde = cpu_to_be64(paddr | flags | nls) }; > +} In these two, are you sure you want to demand all callers to only ever pass in page-aligned addresses? > +static inline paddr_t pte_to_paddr(pte_t pte) > +{ > + return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT); > +} > + > +static inline paddr_t pde_to_paddr(pde_t pde) > +{ > + return be64_to_cpu(pde.pde) & (PDE_NLB_MASK << PDE_NLB_SHIFT); > +} > + > +static inline bool pte_is_valid(pte_t pte) > +{ > + return pte.pte & be64_to_cpu(PTE_VALID); > +} > + > +static inline bool pde_is_valid(pde_t pde) > +{ > + return pde.pde & be64_to_cpu(PDE_VALID); > +} > + > +/* > + * ISA 3.0 partition and process table entry format > + */ > +struct patb_entry { > + __be64 patb0; > + __be64 patb1; > +}; > +#define PATB0_HR PPC_BIT(0) /* host uses radix */ > +#define PATB1_GR PPC_BIT(0) /* guest uses radix; must match HR */ > + > +struct prtb_entry { > + __be64 prtb0; > + __be64 reserved; > +}; > + > +/* > + * We support 52 bits, hence: > + * bits 52 - 31 = 21, 0b10101 > + * RTS encoding details > + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long > + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long > + */ > +#define RTS_FIELD ((0x2UL << 61) | (0x5UL << 5)) I'm afraid I can't bring comment and #define together. Bits 0-3 are 4 bits; bits 6-8 are only three. And I'd expect the lower ends of the ranges to appear as shift counts in the expression. > --- a/xen/arch/ppc/include/asm/processor.h > +++ b/xen/arch/ppc/include/asm/processor.h > @@ -133,6 +133,37 @@ struct cpu_user_regs > uint32_t entry_vector; > }; > > +static __inline__ void sync(void) > +{ > + asm volatile ( "sync" ); > +} > + > +static __inline__ void isync(void) > +{ > + asm volatile ( "isync" ); > +} > + > +static inline unsigned long mfmsr(void) > +{ > + unsigned long msr; > + asm volatile ( "mfmsr %0" : "=&r"(msr) ); Why the &? Also nit: Missing blank between closing double quote and opening paren. More instances of this below. > + return msr; > +} > + > +static inline void mtmsrd(unsigned long msr) > +{ > + asm volatile ( "mtmsrd %0" : : "r"(msr) ); > +} > + > +#define mtspr(reg, val) asm volatile ( "mtspr %0,%1" : : "K"(reg), "r"(val) ) > + > +#define mfspr(reg) \ > + ({ \ > + unsigned long val; \ > + asm volatile ( "mfspr %0,%1" : "=r"(val) : "K"(reg) ); \ > + val; \ > + }) Why #define-s here when higher up you successfully use inline functions? Also (nit) blanks would again be nice ahead of the commas. 
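Regarding the page-alignment question on paddr_to_pte()/paddr_to_pde() above: a minimal sketch of a masking variant, reusing the patch's own field macros (whether to mask silently or to assert alignment instead is left open in the thread):

    static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags)
    {
        /* Keep only the RPN field so unaligned callers can't corrupt flags. */
        paddr_t rpn = paddr & (PTE_RPN_MASK << PTE_RPN_SHIFT);

        return (pte_t){ .pte = cpu_to_be64(rpn | flags | PTE_LEAF) };
    }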
> --- /dev/null > +++ b/xen/arch/ppc/include/asm/regs.h > @@ -0,0 +1,138 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > +/* > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. You have an SPDX line first thing - why the spelled out license? > --- /dev/null > +++ b/xen/arch/ppc/mm-radix.c > @@ -0,0 +1,268 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > +#include <xen/init.h> > +#include <xen/kernel.h> > +#include <xen/types.h> > +#include <xen/lib.h> Nit: Please sort headers whenever possible; also ... > +#include <asm/bitops.h> > +#include <asm/byteorder.h> > +#include <asm/early_printk.h> > +#include <asm/mm.h> > +#include <asm/page.h> > +#include <asm/processor.h> > +#include <asm/regs.h> > +#include <asm/msr.h> ... this group then. > +void enable_mmu(void); > + > +#define INITIAL_LVL1_PD_COUNT 1 > +#define INITIAL_LVL2_LVL3_PD_COUNT 2 > +#define INITIAL_LVL4_PT_COUNT 256 > + > +static size_t initial_lvl1_pd_pool_used; > +static struct lvl1_pd __aligned(sizeof(struct lvl1_pd)) > +initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT]; > + > +static size_t initial_lvl2_lvl3_pd_pool_used; > +static struct lvl2_pd __aligned(sizeof(struct lvl2_pd)) > +initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT]; > + > +static size_t initial_lvl4_pt_pool_used; > +static struct lvl4_pt __aligned(sizeof(struct lvl4_pt)) > +initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT]; Can instances of these structures usefully exist and be not suitably aligned? If not, having the types rather than the instances already declare the necessary alignment would seem better to me. > +/* Only reserve minimum Partition and Process tables */ > +#define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */ > +#define PATB_SIZE (1UL << PATB_SIZE_LOG2) > +#define PRTB_SIZE_LOG2 12 > +#define PRTB_SIZE (1UL << PRTB_SIZE_LOG2) > + > +static struct patb_entry > + __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)]; > + > +static struct prtb_entry > + __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)]; The situation is different here, as the arrays here aren't embedded into the type (being just single entries). > +_Static_assert(sizeof(initial_patb) == PATB_SIZE); Is this really useful? And if so, why no similar check for prtb? Furthermore BUILD_BUG_ON() would be better, playing again into what the range of suitable compilers is for this to build. > +static void __init setup_initial_mapping(struct lvl1_pd *lvl1, > + vaddr_t map_start, > + vaddr_t map_end, > + paddr_t phys_base) > +{ > + uint64_t page_addr; > + > + if ( map_start & ~PAGE_MASK ) > + { > + early_printk("Xen _start be aligned to 64k (PAGE_SIZE) boundary\n"); > + die(); > + } > + > + if ( (uint64_t) phys_base & ~PAGE_MASK ) Why the cast?
> + { > + early_printk("Xen should be loaded at 64k (PAGE_SIZE) boundary\n"); > + die(); > + } > + > + for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE ) > + { > + struct lvl2_pd *lvl2; > + struct lvl3_pd *lvl3; > + struct lvl4_pt *lvl4; > + pde_t *pde; > + pte_t *pte; > + > + /* Allocate LVL 2 PD if necessary */ > + pde = pt_entry(lvl1, page_addr); > + if ( !pde_is_valid(*pde) ) > + { > + lvl2 = lvl2_pd_pool_alloc(); > + *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2); paddr_to_pde() doesn't mask off the top bits. Are you relying on lvl2_pd_pool_alloc() using PIC addressing to get at the value it then returns? Also this and the similar lines below look to be a little too long. > + } > + else > + lvl2 = (struct lvl2_pd *) pde_to_paddr(*pde); This casts a physical address to a pointer; normally only virtual addresses can be used this way. Could there be a helper added to have such a dangerous-looking construct in (ideally) just a single place? > + /* Allocate LVL 3 PD if necessary */ > + pde = pt_entry(lvl2, page_addr); > + if ( !pde_is_valid(*pde) ) > + { > + lvl3 = lvl3_pd_pool_alloc(); > + *pde = paddr_to_pde(__pa(lvl3), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_3); > + } > + else > + lvl3 = (struct lvl3_pd *) pde_to_paddr(*pde); > + > + /* Allocate LVL 4 PT if necessary */ > + pde = pt_entry(lvl3, page_addr); > + if ( !pde_is_valid(*pde) ) > + { > + lvl4 = lvl4_pt_pool_alloc(); > + *pde = paddr_to_pde(__pa(lvl4), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_4); > + } > + else > + lvl4 = (struct lvl4_pt *) pde_to_paddr(*pde); > + > + /* Finally, create PTE in LVL 4 PT */ > + pte = pt_entry(lvl4, page_addr); > + if ( !pte_is_valid(*pte) ) > + { > + unsigned long paddr = (page_addr - map_start) + phys_base; > + unsigned long flags; > + radix_dprint(paddr, "being mapped to "); Blank line again please between declarations and statements. > + radix_dprint(page_addr, "!\n"); > + if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) ) > + { > + radix_dprint(page_addr, "being marked as TEXT (RX)\n"); > + flags = PTE_XEN_RX; > + } > + else if ( is_kernel_rodata(page_addr) ) > + { > + radix_dprint(page_addr, "being marked as RODATA (RO)\n"); > + flags = PTE_XEN_RO; > + } > + else > + { > + radix_dprint(page_addr, "being marked as DEFAULT (RW)\n"); > + flags = PTE_XEN_RW; > + } > + > + *pte = paddr_to_pte(paddr, flags); > + radix_dprint(paddr_to_pte(paddr, flags).pte, > + "is result of PTE map!\n"); > + } > + else > + { > + early_printk("BUG: Tried to create PTE for already-mapped page!"); > + die(); > + } > + } > +} > + > +static void setup_partition_table(struct patb_entry *patb, struct lvl1_pd *root) __init? > +{ > + unsigned long ptcr; > + > + /* Configure entry for LPID 0 to enable Radix and point to root PD */ > + uint64_t patb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1 | PATB0_HR; > + uint64_t patb1 = __pa(initial_prtb) | (PRTB_SIZE_LOG2 - 12) | PATB1_GR; > + patb[0].patb0 = cpu_to_be64(patb0); > + patb[0].patb1 = cpu_to_be64(patb1); > + > + ptcr = __pa(initial_patb) | (PATB_SIZE_LOG2 - 12); > + mtspr(SPRN_PTCR, ptcr); > +} > + > +static void setup_process_table(struct prtb_entry *prtb, struct lvl1_pd *root) __init? 
> +{ > + /* Configure entry for PID 0 to point to root PD */ > + uint64_t prtb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1; > + initial_prtb[0].prtb0 = cpu_to_be64(prtb0); > +} > + > +void __init setup_initial_pagetables(void) > +{ > + struct lvl1_pd *root = lvl1_pd_pool_alloc(); > + unsigned long lpcr; > + > + setup_initial_mapping(root, > + (vaddr_t) _start, > + (vaddr_t) _end, > + __pa(_start)); setup_initial_mapping(root, (vaddr_t)_start, (vaddr_t)_end, __pa(_start)); (if flowing across multiple lines is needed / wanted) > --- a/xen/arch/ppc/opal.c > +++ b/xen/arch/ppc/opal.c > @@ -1,8 +1,10 @@ > /* SPDX-License-Identifier: GPL-2.0-or-later */ > +#include "xen/compiler.h" Surely <xen/compiler.h>, but then neither this nor ... > #include <asm/boot.h> > #include <asm/early_printk.h> > #include <asm/opal-api.h> > #include <asm/processor.h> > +#include <asm/mm.h> ... this addition look motivated when nothing else changes in this file. Also (I overlooked this in the earlier patch) all xen/ headers generally want to come ahead of all asm/ ones. > --- a/xen/arch/ppc/setup.c > +++ b/xen/arch/ppc/setup.c > @@ -7,6 +7,8 @@ > /* Xen stack for bringing up the first CPU. */ > unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE); > > +void __init setup_initial_pagetables(void); This needs to live in a header which is included both here and in the file defining the function. > +static void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, > + unsigned int pid, unsigned int ric, > + unsigned int prs) > +{ > + unsigned long rb; > + unsigned long rs; > + > + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); > + rs = ((unsigned long) pid << PPC_BITLSHIFT(31)); > + > + asm volatile ( "tlbiel %0, %1, %2, %3, 1" : > + : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory" ); Nit: One too few indenting blanks (besides others that are missing). Perhaps also better to pull the standalone : onto this second line. Overall perhaps (also for line length) asm volatile ( "tlbiel %0, %1, %2, %3, 1" :: "r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory" ); > +void radix__tlbiel_all(unsigned int action) Is the double underscore in here intentional? Jan
On 8/1/23 8:18 AM, Jan Beulich wrote: > On 29.07.2023 00:21, Shawn Anastasio wrote: >> --- /dev/null >> +++ b/xen/arch/ppc/include/asm/bitops.h >> @@ -0,0 +1,11 @@ >> +#ifndef _ASM_PPC_BITOPS_H >> +#define _ASM_PPC_BITOPS_H >> + >> +#include <xen/compiler.h> > > Not a big deal, but ... > >> +/* PPC bit number conversion */ >> +#define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) >> +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) >> +#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) >> + >> +#endif /* _ASM_PPC_BITOPS_H */ > > ... nothing here looks to require that #include. > Good point, I'll drop it. >> --- /dev/null >> +++ b/xen/arch/ppc/include/asm/mm.h >> @@ -0,0 +1,19 @@ >> +#ifndef _ASM_PPC_MM_H >> +#define _ASM_PPC_MM_H >> + >> +#include <asm/config.h> > > This is included by xen/config.h, which in turn is included from the > compiler command line. > Thanks for letting me know. Will drop. >> --- /dev/null >> +++ b/xen/arch/ppc/include/asm/page.h >> @@ -0,0 +1,178 @@ >> +#ifndef _ASM_PPC_PAGE_H >> +#define _ASM_PPC_PAGE_H >> + >> +#include <xen/types.h> >> + >> +#include <asm/bitops.h> >> +#include <asm/byteorder.h> >> + >> +#define PDE_VALID PPC_BIT(0) >> +#define PDE_NLB_MASK 0xfffffffffffffUL >> +#define PDE_NLB_SHIFT 5UL >> +#define PDE_NLS_MASK 0x1f >> + >> +#define PTE_VALID PPC_BIT(0) >> +#define PTE_LEAF PPC_BIT(1) >> +#define PTE_REFERENCE PPC_BIT(55) >> +#define PTE_CHANGE PPC_BIT(56) >> + >> +/* PTE Attributes */ >> +#define PTE_ATT_SAO PPC_BIT(59) /* Strong Access Ordering */ >> +#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58) >> +#define PTE_ATT_TOLERANT (PPC_BIT(58) | PPC_BIT(59)) >> + >> +/* PTE Encoded Access Authority*/ >> +#define PTE_EAA_PRIVILEGED PPC_BIT(60) >> +#define PTE_EAA_READ PPC_BIT(61) >> +#define PTE_EAA_WRITE PPC_BIT(62) >> +#define PTE_EAA_EXECUTE PPC_BIT(63) >> + >> +/* Field shifts/masks */ >> +#define PTE_RPN_MASK 0x1fffffffffffUL >> +#define PTE_RPN_SHIFT 12UL >> +#define PTE_ATT_MASK 0x3UL >> +#define PTE_ATT_SHIFT 4UL >> +#define PTE_EAA_MASK 0xfUL > > Did you consider introducing just *_MASK values, utilizing MASK_INSR() > and MASK_EXTR() instead of open-coded shifting/masking? > I actually wasn't aware of MASK_INSR/MASK_EXTR when writing this. I've just looked into it though, and I don't think using them makes the code much cleaner. Specifically I'm looking at the implementations of `pte_to_paddr` and `pde_to_paddr` which both need to ensure that the returned value retains its original shift. For example, with pte_to_paddr, this change would be: - return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT); + return MASK_EXTR(be64_to_cpu(pte.pte), PTE_RPN_MASK) << PTE_RPN_SHIFT; In addition to updating the definitions of the *_MASK macros to include the right-most padding zeros. - #define PTE_RPN_MASK 0x1fffffffffffUL + #define PTE_RPN_MASK 0x1fffffffffff000UL I don't really think this is an improvement, so I'm tempted to keep the code as-is. If you feel strongly about the usage of MASK_{INSR,EXTR}, though, I could make the change. 
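For context, MASK_EXTR() and MASK_INSR() are defined in Xen's xen/lib.h along these lines; both derive the shift from the mask itself (its lowest set bit), which is why widening the *_MASK values is the essential part of such a change:

    #define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
    #define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))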
>> +#define PTE_XEN_BASE (PTE_VALID | PTE_EAA_PRIVILEGED | PTE_REFERENCE) >> +#define PTE_XEN_RW (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_WRITE | PTE_CHANGE) >> +#define PTE_XEN_RO (PTE_XEN_BASE | PTE_EAA_READ) >> +#define PTE_XEN_RX (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_EXECUTE) >> + >> +/* >> + * Radix Tree layout for 64KB pages: >> + * >> + * [ L1 (ROOT) PAGE DIRECTORY (8192 * sizeof(pde_t)) ] >> + * | >> + * | >> + * v >> + * [ L2 PAGE DIRECTORY (512 * sizeof(pde_t)) ] >> + * | >> + * | >> + * v >> + * [ L3 PAGE DIRECTORY (512 * sizeof(pde_t)) ] >> + * | >> + * | >> + * v >> + * [ L4 PAGE TABLE (32 * sizeof(pte_t)) ] >> + * | >> + * | >> + * v >> + * [ PAGE TABLE ENTRY ] >> + */ >> + >> +#define XEN_PT_ENTRIES_LOG2_LVL_1 13 /* 2**13 entries, maps 2**13 * 512GB = 4PB */ >> +#define XEN_PT_ENTRIES_LOG2_LVL_2 9 /* 2**9 entries, maps 2**9 * 1GB = 512GB */ >> +#define XEN_PT_ENTRIES_LOG2_LVL_3 9 /* 2**9 entries, maps 2**9 * 1GB = 512GB */ >> +#define XEN_PT_ENTRIES_LOG2_LVL_4 5 /* 2**5 entries, maps 2**5 * 64K = 2MB */ >> + >> +#define XEN_PT_SHIFT_LVL_1 (XEN_PT_SHIFT_LVL_2 + XEN_PT_ENTRIES_LOG2_LVL_2) >> +#define XEN_PT_SHIFT_LVL_2 (XEN_PT_SHIFT_LVL_3 + XEN_PT_ENTRIES_LOG2_LVL_3) >> +#define XEN_PT_SHIFT_LVL_3 (XEN_PT_SHIFT_LVL_4 + XEN_PT_ENTRIES_LOG2_LVL_4) >> +#define XEN_PT_SHIFT_LVL_4 PAGE_SHIFT >> + >> +#define XEN_PT_ENTRIES_LOG2_LVL(lvl) (XEN_PT_ENTRIES_LOG2_LVL_##lvl) >> +#define XEN_PT_SHIFT_LVL(lvl) (XEN_PT_SHIFT_LVL_##lvl) >> +#define XEN_PT_ENTRIES_LVL(lvl) (1UL << XEN_PT_ENTRIES_LOG2_LVL(lvl)) >> +#define XEN_PT_MASK_LVL(lvl) (XEN_PT_ENTRIES_LVL(lvl) - 1) >> +#define XEN_PT_INDEX_LVL(lvl, va) ((va >> XEN_PT_SHIFT_LVL(lvl)) & XEN_PT_MASK_LVL(lvl)) >> + >> +/* >> + * Calculate the index of the provided virtual address in the provided >> + * page table struct >> + */ >> +#define pt_index(pt, va) _Generic((pt), \ > > Earlier on I did ask about the minimum compiler version you require for > building the PPC hypervisor. Iirc you said about any, but here you're > using another feature where I'm not sure how far back it is available. > As indicated back then, it would be nice if ./README could gain > respective information. > I'm locally building with gcc 10 (shipped with Debian 11), and the Xen CI image for ppc64le builds uses gcc 11 I believe. _Generic and the other features I'm using are certainly available further back, but I haven't personally tested anything earlier. If there are no objections, I'm tempted to just codify gcc 10 as the minimum supported compiler version and leave it up to users if they want to try testing on earlier versions. >> + struct lvl1_pd * : XEN_PT_INDEX_LVL(1, (va)), \ >> + struct lvl2_pd * : XEN_PT_INDEX_LVL(2, (va)), \ >> + struct lvl3_pd * : XEN_PT_INDEX_LVL(3, (va)), \ >> + struct lvl4_pt * : XEN_PT_INDEX_LVL(4, (va))) >> + >> +#define pt_entry(pt, va) (&((pt)->entries[pt_index((pt), (va))])) > > As to style: Here (and elsewhere) you put unnecessary parentheses > around two of the three uses of the macro arguments. If you think you > want to keep it like that, feel free; in general we're recommending to > omit ones not really needed, to help readability. Otoh a little higher > up in XEN_PT_INDEX_LVL() parentheses are missing around va. > If possible I'd prefer to just leave it as-is. Unconditionally parenthesizing macro arguments seems less error-prone as a general policy, though I agree the latter two here are unnecessary. That said, the lack of parentheses around va in XEN_PT_INDEX_LVL was definitely an oversight and I'll fix that.
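For concreteness, the corrected macro referred to above would presumably read:

    #define XEN_PT_INDEX_LVL(lvl, va) (((va) >> XEN_PT_SHIFT_LVL(lvl)) & XEN_PT_MASK_LVL(lvl))

so that compound expressions (e.g. base + offset) can be passed as va without operator-precedence surprises.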
>> +typedef struct >> +{ >> + __be64 pde; >> +} pde_t; >> + >> +typedef struct >> +{ >> + __be64 pte; >> +} pte_t; >> + >> +struct lvl1_pd >> +{ >> + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1]; >> +}; >> + >> +struct lvl2_pd >> +{ >> + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_2]; >> +}; >> + >> +struct lvl3_pd >> +{ >> + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_3]; >> +}; >> + >> +struct lvl4_pt >> +{ >> + pte_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_4]; >> +}; >> + >> +static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags) >> +{ >> + return (pte_t){ .pte = cpu_to_be64(paddr | flags | PTE_LEAF) }; >> +} >> + >> +static inline pde_t paddr_to_pde(paddr_t paddr, unsigned long flags, unsigned long nls) >> +{ >> + return (pde_t){ .pde = cpu_to_be64(paddr | flags | nls) }; >> +} > > In these two, are you sure you want to demand all callers to only ever > pass in page-aligned addresses? > Fair point. Perhaps it would be best to mask the caller-provided address. >> +static inline paddr_t pte_to_paddr(pte_t pte) >> +{ >> + return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT); >> +} >> + >> +static inline paddr_t pde_to_paddr(pde_t pde) >> +{ >> + return be64_to_cpu(pde.pde) & (PDE_NLB_MASK << PDE_NLB_SHIFT); >> +} >> + >> +static inline bool pte_is_valid(pte_t pte) >> +{ >> + return pte.pte & be64_to_cpu(PTE_VALID); >> +} >> + >> +static inline bool pde_is_valid(pde_t pde) >> +{ >> + return pde.pde & be64_to_cpu(PDE_VALID); >> +} >> + >> +/* >> + * ISA 3.0 partition and process table entry format >> + */ >> +struct patb_entry { >> + __be64 patb0; >> + __be64 patb1; >> +}; >> +#define PATB0_HR PPC_BIT(0) /* host uses radix */ >> +#define PATB1_GR PPC_BIT(0) /* guest uses radix; must match HR */ >> + >> +struct prtb_entry { >> + __be64 prtb0; >> + __be64 reserved; >> +}; >> + >> +/* >> + * We support 52 bits, hence: >> + * bits 52 - 31 = 21, 0b10101 >> + * RTS encoding details >> + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long >> + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long >> + */ >> +#define RTS_FIELD ((0x2UL << 61) | (0x5UL << 5)) > > I'm afraid I can't bring comment and #define together. Bits 0-3 are > 4 bits; bits 6-8 are only three. And I'd expect the lower ends of > the ranges to appear as shift counts in the expression. > Yes, I agree that this is confusing. For reference, this comment was copied verbatim from a comment in Linux[*] and I admittedly didn't pay as much attention to it as I did to the expression itself. As far as I can tell, the original author intended to use 1-indexed, non-PPC-style bit numbering and mistakenly wrote 0-3 instead of 1-3. Under this interpretation, the comment makes more sense, as do the shift counts in the expression (which would otherwise appear off by one). I'll rewrite it to correct the mistake and use standard 0-indexing for bits.
Something like: /* * We support 52 bits, hence: * bits 52 - 31 = 21, 0b10101 * RTS encoding details * bits 0 - 2 of rts -> bits 5 - 7 of unsigned long * bits 3 - 4 of rts -> bits 61 - 62 of unsigned long */ [*] arch/powerpc/include/asm/book3s/64/radix.h:radix__get_tree_size >> --- a/xen/arch/ppc/include/asm/processor.h >> +++ b/xen/arch/ppc/include/asm/processor.h >> @@ -133,6 +133,37 @@ struct cpu_user_regs >> uint32_t entry_vector; >> }; >> >> +static __inline__ void sync(void) >> +{ >> + asm volatile ( "sync" ); >> +} >> + >> +static __inline__ void isync(void) >> +{ >> + asm volatile ( "isync" ); >> +} >> + >> +static inline unsigned long mfmsr(void) >> +{ >> + unsigned long msr; >> + asm volatile ( "mfmsr %0" : "=&r"(msr) ); > > Why the &? This was in the original Xen 3.2 header that this function was taken from, but I agree that it seems strange. I'll drop it in v2. > Also nit: Missing blank between closing double quote and opening paren. > More instances of this below. > Will fix. >> + return msr; >> +} >> + >> +static inline void mtmsrd(unsigned long msr) >> +{ >> + asm volatile ( "mtmsrd %0" : : "r"(msr) ); >> +} >> + >> +#define mtspr(reg, val) asm volatile ( "mtspr %0,%1" : : "K"(reg), "r"(val) ) >> + >> +#define mfspr(reg) \ >> + ({ \ >> + unsigned long val; \ >> + asm volatile ( "mfspr %0,%1" : "=r"(val) : "K"(reg) ); \ >> + val; \ >> + }) > > Why #define-s here when higher up you successfully use inline functions? > This was a temporary thing that I meant to convert into a full function before committing but seem to have overlooked. Thanks for pointing it out. > Also (nit) blanks would again be nice ahead of the commas. > Will do. >> --- /dev/null >> +++ b/xen/arch/ppc/include/asm/regs.h >> @@ -0,0 +1,138 @@ >> +/* SPDX-License-Identifier: GPL-2.0-or-later */ >> +/* >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; either version 2 of the License, or >> + * (at your option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program; if not, write to the Free Software >> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. > > You have an SPDX line first thing - why the spelled out license? > Leftover from importing the header from Xen 3.2. Will drop. >> --- /dev/null >> +++ b/xen/arch/ppc/mm-radix.c >> @@ -0,0 +1,268 @@ >> +/* SPDX-License-Identifier: GPL-2.0-or-later */ >> +#include <xen/init.h> >> +#include <xen/kernel.h> >> +#include <xen/types.h> >> +#include <xen/lib.h> > > Nit: Please sort headers whenever possible; also ... > >> +#include <asm/bitops.h> >> +#include <asm/byteorder.h> >> +#include <asm/early_printk.h> >> +#include <asm/mm.h> >> +#include <asm/page.h> >> +#include <asm/processor.h> >> +#include <asm/regs.h> >> +#include <asm/msr.h> > > ... this group then. > I'll alphabetically sort both groups.
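A sketch of the inline-function form of mfspr()/mtspr() mentioned above. Note the caveat: the "K" constraint (unsigned 16-bit constant) can only be satisfied if the SPR number is still a compile-time constant at the asm statement, i.e. this assumes the functions are always fully inlined with constant arguments -- the usual reason such accessors stay macros:

    static inline void mtspr(unsigned int reg, unsigned long val)
    {
        asm volatile ( "mtspr %0, %1" : : "K" (reg), "r" (val) );
    }

    static inline unsigned long mfspr(unsigned int reg)
    {
        unsigned long val;

        asm volatile ( "mfspr %0, %1" : "=r" (val) : "K" (reg) );

        return val;
    }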
>> +void enable_mmu(void); >> + >> +#define INITIAL_LVL1_PD_COUNT 1 >> +#define INITIAL_LVL2_LVL3_PD_COUNT 2 >> +#define INITIAL_LVL4_PT_COUNT 256 >> + >> +static size_t initial_lvl1_pd_pool_used; >> +static struct lvl1_pd __aligned(sizeof(struct lvl1_pd)) >> +initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT]; >> + >> +static size_t initial_lvl2_lvl3_pd_pool_used; >> +static struct lvl2_pd __aligned(sizeof(struct lvl2_pd)) >> +initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT]; >> + >> +static size_t initial_lvl4_pt_pool_used; >> +static struct lvl4_pt __aligned(sizeof(struct lvl4_pt)) >> +initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT]; > > Can instances of these structures usefully exist and be not suitably > aligned? If not, having the types rather than the instances already > declare the necessary alignment would seem better to me. > I can't think of a scenario in which we'd want instances of these structures without alignment. I'll go ahead and apply the alignment to the struct definitions instead. >> +/* Only reserve minimum Partition and Process tables */ >> +#define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */ >> +#define PATB_SIZE (1UL << PATB_SIZE_LOG2) >> +#define PRTB_SIZE_LOG2 12 >> +#define PRTB_SIZE (1UL << PRTB_SIZE_LOG2) >> + >> +static struct patb_entry >> + __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)]; >> + >> +static struct prtb_entry >> + __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)]; > > The situation is different here, as the arrays here aren't embedded into > the type (being just single entries). > Right. >> +_Static_assert(sizeof(initial_patb) == PATB_SIZE); > > Is this really useful? And if so, why no similar check for prtb? > Furthermore BUILD_BUG_ON() would be better, playing again into what > the range of suitable compilers is for this to build. > This was a debug statement that I had inadvertently left in. I'll remove it. >> +static void __init setup_initial_mapping(struct lvl1_pd *lvl1, >> + vaddr_t map_start, >> + vaddr_t map_end, >> + paddr_t phys_base) >> +{ >> + uint64_t page_addr; >> + >> + if ( map_start & ~PAGE_MASK ) >> + { >> + early_printk("Xen _start be aligned to 64k (PAGE_SIZE) boundary\n"); >> + die(); >> + } >> + >> + if ( (uint64_t) phys_base & ~PAGE_MASK ) > > Why the cast? > I believe in an earlier revision of this code I had phys_base's type as void* or similar, so this is just a leftover that I forgot to clean up. I'll remove the cast. >> + { >> + early_printk("Xen should be loaded at 64k (PAGE_SIZE) boundary\n"); >> + die(); >> + } >> + >> + for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE ) >> + { >> + struct lvl2_pd *lvl2; >> + struct lvl3_pd *lvl3; >> + struct lvl4_pt *lvl4; >> + pde_t *pde; >> + pte_t *pte; >> + >> + /* Allocate LVL 2 PD if necessary */ >> + pde = pt_entry(lvl1, page_addr); >> + if ( !pde_is_valid(*pde) ) >> + { >> + lvl2 = lvl2_pd_pool_alloc(); >> + *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2); > > paddr_to_pde() doesn't mask off the top bits. Are you relying on > lvl2_pd_pool_alloc() using PIC addressing to get at the value it > then returns? > I'm not sure I understand the question here. The pointer returned by lvl2_pd_pool_alloc() will indeed have the top address bits set in accordance with the link address, even before paging is enabled because of the relocation code and jump to XEN_VIRT_START in patch 2.
This is fine though, since I call __pa() to strip off these bits before passing it to paddr_to_pde. > Also this and the similar lines below look to be a little too long. > Yeah, I noticed this as well, but my understanding was that lines longer than 80 columns were permitted in cases where they don't hurt readability. In any case, I can fix this. >> + } >> + else >> + lvl2 = (struct lvl2_pd *) pde_to_paddr(*pde); > > This casts a physical address to a pointer; normally only virtual > addresses can be used this way. Could there be a helper added to > have such a dangerous-looking construct in (ideally) just a single > place? > Fair point. We already have __va() which will work due to the previously mentioned relocation code in patch 2 even before the MMU is enabled. I think it might make sense to use that. >> + /* Allocate LVL 3 PD if necessary */ >> + pde = pt_entry(lvl2, page_addr); >> + if ( !pde_is_valid(*pde) ) >> + { >> + lvl3 = lvl3_pd_pool_alloc(); >> + *pde = paddr_to_pde(__pa(lvl3), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_3); >> + } >> + else >> + lvl3 = (struct lvl3_pd *) pde_to_paddr(*pde); >> + >> + /* Allocate LVL 4 PT if necessary */ >> + pde = pt_entry(lvl3, page_addr); >> + if ( !pde_is_valid(*pde) ) >> + { >> + lvl4 = lvl4_pt_pool_alloc(); >> + *pde = paddr_to_pde(__pa(lvl4), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_4); >> + } >> + else >> + lvl4 = (struct lvl4_pt *) pde_to_paddr(*pde); >> + >> + /* Finally, create PTE in LVL 4 PT */ >> + pte = pt_entry(lvl4, page_addr); >> + if ( !pte_is_valid(*pte) ) >> + { >> + unsigned long paddr = (page_addr - map_start) + phys_base; >> + unsigned long flags; >> + radix_dprint(paddr, "being mapped to "); > > Blank line again please between declarations and statements. > Will fix. >> + radix_dprint(page_addr, "!\n"); >> + if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) ) >> + { >> + radix_dprint(page_addr, "being marked as TEXT (RX)\n"); >> + flags = PTE_XEN_RX; >> + } >> + else if ( is_kernel_rodata(page_addr) ) >> + { >> + radix_dprint(page_addr, "being marked as RODATA (RO)\n"); >> + flags = PTE_XEN_RO; >> + } >> + else >> + { >> + radix_dprint(page_addr, "being marked as DEFAULT (RW)\n"); >> + flags = PTE_XEN_RW; >> + } >> + >> + *pte = paddr_to_pte(paddr, flags); >> + radix_dprint(paddr_to_pte(paddr, flags).pte, >> + "is result of PTE map!\n"); >> + } >> + else >> + { >> + early_printk("BUG: Tried to create PTE for already-mapped page!"); >> + die(); >> + } >> + } >> +} >> + >> +static void setup_partition_table(struct patb_entry *patb, struct lvl1_pd *root) > > __init? > Yes, good call. Will add. >> +{ >> + unsigned long ptcr; >> + >> + /* Configure entry for LPID 0 to enable Radix and point to root PD */ >> + uint64_t patb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1 | PATB0_HR; >> + uint64_t patb1 = __pa(initial_prtb) | (PRTB_SIZE_LOG2 - 12) | PATB1_GR; >> + patb[0].patb0 = cpu_to_be64(patb0); >> + patb[0].patb1 = cpu_to_be64(patb1); >> + >> + ptcr = __pa(initial_patb) | (PATB_SIZE_LOG2 - 12); >> + mtspr(SPRN_PTCR, ptcr); >> +} >> + >> +static void setup_process_table(struct prtb_entry *prtb, struct lvl1_pd *root) > > __init? > Ditto. 
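A sketch of the single-place helper being agreed on here, using __va() (the name pde_to_virt is illustrative, not from the patch):

    static inline void *pde_to_virt(pde_t pde)
    {
        return __va(pde_to_paddr(pde));
    }

The assignments above would then become e.g. lvl2 = pde_to_virt(*pde); leaving no bare physical-to-pointer cast at the use sites.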
>> +{ >> + /* Configure entry for PID 0 to point to root PD */ >> + uint64_t prtb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1; >> + initial_prtb[0].prtb0 = cpu_to_be64(prtb0); >> +} >> + >> +void __init setup_initial_pagetables(void) >> +{ >> + struct lvl1_pd *root = lvl1_pd_pool_alloc(); >> + unsigned long lpcr; >> + >> + setup_initial_mapping(root, >> + (vaddr_t) _start, >> + (vaddr_t) _end, >> + __pa(_start)); > > setup_initial_mapping(root, > (vaddr_t)_start, > (vaddr_t)_end, > __pa(_start)); > > (if flowing across multiple lines is needed / wanted) > I just checked and the line without flowing fits within exactly 80 columns, so I'll make that change. >> --- a/xen/arch/ppc/opal.c >> +++ b/xen/arch/ppc/opal.c >> @@ -1,8 +1,10 @@ >> /* SPDX-License-Identifier: GPL-2.0-or-later */ >> +#include "xen/compiler.h" > > Surely <xen/compiler.h>, but then neither this nor ... > This was accidentally added by my editor/clangd and was not my intention. I'll remove all changes to this file in this patch. >> #include <asm/boot.h> >> #include <asm/early_printk.h> >> #include <asm/opal-api.h> >> #include <asm/processor.h> >> +#include <asm/mm.h> > > ... this addition look motivated when nothing else changes in this file. > See above. > Also (I overlooked this in the earlier patch) all xen/ headers generally > want to come ahead of all asm/ ones. > I'll keep that in mind going forward and for the next time I have a valid reason to touch this file. >> --- a/xen/arch/ppc/setup.c >> +++ b/xen/arch/ppc/setup.c >> @@ -7,6 +7,8 @@ >> /* Xen stack for bringing up the first CPU. */ >> unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE); >> >> +void __init setup_initial_pagetables(void); > > This needs to live in a header which is included both here and in the > file defining the function. > I see I already declared the function's prototype in mm.h. This was probably an oversight from before I added that. I'll drop the prototype from setup.c and include asm/mm.h instead. >> +static void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, >> + unsigned int pid, unsigned int ric, >> + unsigned int prs) >> +{ >> + unsigned long rb; >> + unsigned long rs; >> + >> + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); >> + rs = ((unsigned long) pid << PPC_BITLSHIFT(31)); >> + >> + asm volatile ( "tlbiel %0, %1, %2, %3, 1" : >> + : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory" ); > > Nit: One too few indenting blanks (besides others that are missing). > Perhaps also better to pull the standalone : onto this second line. > Overall perhaps (also for line length) > > asm volatile ( "tlbiel %0, %1, %2, %3, 1" > :: "r" (rb), "r" (rs), "i" (ric), "i" (prs) > : "memory" ); > Will do. >> +void radix__tlbiel_all(unsigned int action) > > Is the double underscore in here intentional? > It matches the original Linux file from which this routine was imported (referenced in the file's top-level comment). As I understand it, this is meant to indicate a private function that shouldn't be called outside of radix-specific MMU or TLB code. As Linux's radix support code spans multiple files, a static function wouldn't work for that. > Jan Thanks, Shawn
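On the alignment point settled earlier in this message, a sketch of carrying the alignment on the type instead of on each instance (for lvl1_pd the required alignment equals the struct size, sizeof(pde_t) << 13 == 64KiB):

    struct lvl1_pd
    {
        pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1];
    } __aligned(sizeof(pde_t) << XEN_PT_ENTRIES_LOG2_LVL_1);

    /* Pool instances then need no per-object attribute: */
    static struct lvl1_pd initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT];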
On 09.08.2023 22:54, Shawn Anastasio wrote: > On 8/1/23 8:18 AM, Jan Beulich wrote: >> On 29.07.2023 00:21, Shawn Anastasio wrote: >>> --- /dev/null >>> +++ b/xen/arch/ppc/include/asm/page.h >>> @@ -0,0 +1,178 @@ >>> +#ifndef _ASM_PPC_PAGE_H >>> +#define _ASM_PPC_PAGE_H >>> + >>> +#include <xen/types.h> >>> + >>> +#include <asm/bitops.h> >>> +#include <asm/byteorder.h> >>> + >>> +#define PDE_VALID PPC_BIT(0) >>> +#define PDE_NLB_MASK 0xfffffffffffffUL >>> +#define PDE_NLB_SHIFT 5UL >>> +#define PDE_NLS_MASK 0x1f >>> + >>> +#define PTE_VALID PPC_BIT(0) >>> +#define PTE_LEAF PPC_BIT(1) >>> +#define PTE_REFERENCE PPC_BIT(55) >>> +#define PTE_CHANGE PPC_BIT(56) >>> + >>> +/* PTE Attributes */ >>> +#define PTE_ATT_SAO PPC_BIT(59) /* Strong Access Ordering */ >>> +#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58) >>> +#define PTE_ATT_TOLERANT (PPC_BIT(58) | PPC_BIT(59)) >>> + >>> +/* PTE Encoded Access Authority*/ >>> +#define PTE_EAA_PRIVILEGED PPC_BIT(60) >>> +#define PTE_EAA_READ PPC_BIT(61) >>> +#define PTE_EAA_WRITE PPC_BIT(62) >>> +#define PTE_EAA_EXECUTE PPC_BIT(63) >>> + >>> +/* Field shifts/masks */ >>> +#define PTE_RPN_MASK 0x1fffffffffffUL >>> +#define PTE_RPN_SHIFT 12UL >>> +#define PTE_ATT_MASK 0x3UL >>> +#define PTE_ATT_SHIFT 4UL >>> +#define PTE_EAA_MASK 0xfUL >> >> Did you consider introducing just *_MASK values, utilizing MASK_INSR() >> and MASK_EXTR() instead of open-coded shifting/masking? >> > > I actually wasn't aware of MASK_INSR/MASK_EXTR when writing this. I've > just looked into it though, and I don't think using them makes the code > much cleaner. Specifically I'm looking at the implementations of > `pte_to_paddr` and `pde_to_paddr` which both need to ensure that the > returned value retains its original shift. > > For example, with pte_to_paddr, this change would be: > - return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT); > + return MASK_EXTR(be64_to_cpu(pte.pte), PTE_RPN_MASK) << PTE_RPN_SHIFT; > > In addition to updating the definitions of the *_MASK macros to include > the right-most padding zeros. > > - #define PTE_RPN_MASK 0x1fffffffffffUL > + #define PTE_RPN_MASK 0x1fffffffffff000UL This is the important change. With that the above will be return be64_to_cpu(pte.pte) & PTE_RPN_MASK; and in cases where you want the un-shifted value you'd use MASK_EXTR(). >>> +/* >>> + * Calculate the index of the provided virtual address in the provided >>> + * page table struct >>> + */ >>> +#define pt_index(pt, va) _Generic((pt), \ >> >> Earlier on I did ask about the minimum compiler version you require for >> building the PPC hypervisor. Iirc you said about any, but here you're >> using another feature where I'm not sure how far back it is available. >> As indicated back then, it would be nice if ./README could gain >> respective information. >> > > I'm locally building with gcc 10 (shipped with Debian 11), and the Xen > CI image for ppc64le builds uses gcc 11 I believe. _Generic and the > other features I'm using are certainly available further back, but I > haven't personally tested anything earlier. If there's no objections, > I'm tempted to just codify gcc 10 as the minimum supported compiler > version and leave it up to users if they want to try testing on earlier > versions. Spelling out what you know works is good enough for starters. As you say, people can propose updating when found too high. 
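Putting the mask discussion together, a sketch of the resolved form (assuming PTE_RPN_MASK is widened to 0x1fffffffffff000UL as shown above):

    static inline paddr_t pte_to_paddr(pte_t pte)
    {
        return be64_to_cpu(pte.pte) & PTE_RPN_MASK; /* already a shifted paddr */
    }

    /* Where the raw, un-shifted RPN is wanted: */
    unsigned long rpn = MASK_EXTR(be64_to_cpu(pte.pte), PTE_RPN_MASK);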
>>> + for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE ) >>> + { >>> + struct lvl2_pd *lvl2; >>> + struct lvl3_pd *lvl3; >>> + struct lvl4_pt *lvl4; >>> + pde_t *pde; >>> + pte_t *pte; >>> + >>> + /* Allocate LVL 2 PD if necessary */ >>> + pde = pt_entry(lvl1, page_addr); >>> + if ( !pde_is_valid(*pde) ) >>> + { >>> + lvl2 = lvl2_pd_pool_alloc(); >>> + *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2); >> >> paddr_to_pde() doesn't mask off the top bits. Are you relying on >> lvl2_pd_pool_alloc() using PIC addressing to get at the value it >> then returns? >> > I'm not sure I understand the question here. The pointer returned by > lvl2_pd_pool_alloc() will indeed have the top address bits set in > accordance with the link address, even before paging is enabled because > of the relocation code and jump to XEN_VIRT_START in patch 2. This is > fine though, since I call __pa() to strip off these bits before passing > it to paddr_to_pde. I'm sorry, I managed to overlook the __pa(). >> Also this and the similar lines below look to be a little too long. > > Yeah, I noticed this as well, but my understanding was that lines longer > than 80 columns were permitted in cases where they don't hurt > readability. In any case, I can fix this. There's only one general exception to line limit: printk() (and the like) format strings want to live all on one line (unless there are embedded \n), to allow reasonably grep-ing for them. >>> +void radix__tlbiel_all(unsigned int action) >> >> Is the double underscore in here intentional? >> > It matches the original Linux file from which this routine was imported > (referenced in the file's top-level comment). > > As I understand it, this is meant to indicate a private function that > shouldn't be called outside of radix-specific MMU or TLB code. As > Linux's radix support code spans multiple files, a static function > wouldn't work for that. We haven't adopted that style yet(?), but I'm also not opposed to you doing so locally. Jan
diff --git a/xen/arch/ppc/Makefile b/xen/arch/ppc/Makefile index 0c0a7884a1..a059ac4c0a 100644 --- a/xen/arch/ppc/Makefile +++ b/xen/arch/ppc/Makefile @@ -2,8 +2,10 @@ obj-$(CONFIG_PPC64) += ppc64/ obj-y += boot-of.init.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.init.o +obj-y += mm-radix.o obj-y += opal.o obj-y += setup.o +obj-y += tlb-radix.o $(TARGET): $(TARGET)-syms cp -f $< $@ diff --git a/xen/arch/ppc/include/asm/bitops.h b/xen/arch/ppc/include/asm/bitops.h new file mode 100644 index 0000000000..a7cd8ec7c5 --- /dev/null +++ b/xen/arch/ppc/include/asm/bitops.h @@ -0,0 +1,11 @@ +#ifndef _ASM_PPC_BITOPS_H +#define _ASM_PPC_BITOPS_H + +#include <xen/compiler.h> + +/* PPC bit number conversion */ +#define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) +#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) + +#endif /* _ASM_PPC_BITOPS_H */ diff --git a/xen/arch/ppc/include/asm/mm.h b/xen/arch/ppc/include/asm/mm.h new file mode 100644 index 0000000000..36e44a4356 --- /dev/null +++ b/xen/arch/ppc/include/asm/mm.h @@ -0,0 +1,19 @@ +#ifndef _ASM_PPC_MM_H +#define _ASM_PPC_MM_H + +#include <asm/config.h> +#include <asm/page-bits.h> + +#define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT) +#define paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT)) + +#define virt_to_maddr(va) ((paddr_t)((vaddr_t)(va) & PADDR_MASK)) +#define maddr_to_virt(pa) ((void *)((paddr_t)(pa) | XEN_VIRT_START)) + +/* Convert between Xen-heap virtual addresses and machine addresses. */ +#define __pa(x) (virt_to_maddr(x)) +#define __va(x) (maddr_to_virt(x)) + +void setup_initial_pagetables(void); + +#endif /* _ASM_PPC_MM_H */ diff --git a/xen/arch/ppc/include/asm/page-bits.h b/xen/arch/ppc/include/asm/page-bits.h index 4c01bf9716..0286177520 100644 --- a/xen/arch/ppc/include/asm/page-bits.h +++ b/xen/arch/ppc/include/asm/page-bits.h @@ -2,6 +2,7 @@ #define __PPC_PAGE_BITS_H__ #define PAGE_SHIFT 16 /* 64 KiB Pages */ -#define PADDR_BITS 48 +#define PADDR_BITS 53 +#define VADDR_BITS 52 #endif /* __PPC_PAGE_BITS_H__ */ diff --git a/xen/arch/ppc/include/asm/page.h b/xen/arch/ppc/include/asm/page.h new file mode 100644 index 0000000000..e58b0a7354 --- /dev/null +++ b/xen/arch/ppc/include/asm/page.h @@ -0,0 +1,178 @@ +#ifndef _ASM_PPC_PAGE_H +#define _ASM_PPC_PAGE_H + +#include <xen/types.h> + +#include <asm/bitops.h> +#include <asm/byteorder.h> + +#define PDE_VALID PPC_BIT(0) +#define PDE_NLB_MASK 0xfffffffffffffUL +#define PDE_NLB_SHIFT 5UL +#define PDE_NLS_MASK 0x1f + +#define PTE_VALID PPC_BIT(0) +#define PTE_LEAF PPC_BIT(1) +#define PTE_REFERENCE PPC_BIT(55) +#define PTE_CHANGE PPC_BIT(56) + +/* PTE Attributes */ +#define PTE_ATT_SAO PPC_BIT(59) /* Strong Access Ordering */ +#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58) +#define PTE_ATT_TOLERANT (PPC_BIT(58) | PPC_BIT(59)) + +/* PTE Encoded Access Authority*/ +#define PTE_EAA_PRIVILEGED PPC_BIT(60) +#define PTE_EAA_READ PPC_BIT(61) +#define PTE_EAA_WRITE PPC_BIT(62) +#define PTE_EAA_EXECUTE PPC_BIT(63) + +/* Field shifts/masks */ +#define PTE_RPN_MASK 0x1fffffffffffUL +#define PTE_RPN_SHIFT 12UL +#define PTE_ATT_MASK 0x3UL +#define PTE_ATT_SHIFT 4UL +#define PTE_EAA_MASK 0xfUL + +#define PTE_XEN_BASE (PTE_VALID | PTE_EAA_PRIVILEGED | PTE_REFERENCE) +#define PTE_XEN_RW (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_WRITE | PTE_CHANGE) +#define PTE_XEN_RO (PTE_XEN_BASE | PTE_EAA_READ) +#define PTE_XEN_RX (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_EXECUTE) + +/* + * Radix Tree layout for 64KB pages: + * + * [ L1 (ROOT)
PAGE DIRECTORY (8192 * sizeof(pde_t)) ] + * | + * | + * v + * [ L2 PAGE DIRECTORY (512 * sizeof(pde_t)) ] + * | + * | + * v + * [ L3 PAGE DIRECTORY (512 * sizeof(pde_t)) ] + * | + * | + * v + * [ L4 PAGE TABLE (32 * sizeof(pte_t)) ] + * | + * | + * v + * [ PAGE TABLE ENTRY ] + */ + +#define XEN_PT_ENTRIES_LOG2_LVL_1 13 /* 2**13 entries, maps 2**13 * 512GB = 4PB */ +#define XEN_PT_ENTRIES_LOG2_LVL_2 9 /* 2**9 entries, maps 2**9 * 1GB = 512GB */ +#define XEN_PT_ENTRIES_LOG2_LVL_3 9 /* 2**9 entries, maps 2**9 * 1GB = 512GB */ +#define XEN_PT_ENTRIES_LOG2_LVL_4 5 /* 2**5 entries, maps 2**5 * 64K = 2MB */ + +#define XEN_PT_SHIFT_LVL_1 (XEN_PT_SHIFT_LVL_2 + XEN_PT_ENTRIES_LOG2_LVL_2) +#define XEN_PT_SHIFT_LVL_2 (XEN_PT_SHIFT_LVL_3 + XEN_PT_ENTRIES_LOG2_LVL_3) +#define XEN_PT_SHIFT_LVL_3 (XEN_PT_SHIFT_LVL_4 + XEN_PT_ENTRIES_LOG2_LVL_4) +#define XEN_PT_SHIFT_LVL_4 PAGE_SHIFT + +#define XEN_PT_ENTRIES_LOG2_LVL(lvl) (XEN_PT_ENTRIES_LOG2_LVL_##lvl) +#define XEN_PT_SHIFT_LVL(lvl) (XEN_PT_SHIFT_LVL_##lvl) +#define XEN_PT_ENTRIES_LVL(lvl) (1UL << XEN_PT_ENTRIES_LOG2_LVL(lvl)) +#define XEN_PT_MASK_LVL(lvl) (XEN_PT_ENTRIES_LVL(lvl) - 1) +#define XEN_PT_INDEX_LVL(lvl, va) ((va >> XEN_PT_SHIFT_LVL(lvl)) & XEN_PT_MASK_LVL(lvl)) + +/* + * Calculate the index of the provided virtual address in the provided + * page table struct + */ +#define pt_index(pt, va) _Generic((pt), \ + struct lvl1_pd * : XEN_PT_INDEX_LVL(1, (va)), \ + struct lvl2_pd * : XEN_PT_INDEX_LVL(2, (va)), \ + struct lvl3_pd * : XEN_PT_INDEX_LVL(3, (va)), \ + struct lvl4_pt * : XEN_PT_INDEX_LVL(4, (va))) + +#define pt_entry(pt, va) (&((pt)->entries[pt_index((pt), (va))])) + +typedef struct +{ + __be64 pde; +} pde_t; + +typedef struct +{ + __be64 pte; +} pte_t; + +struct lvl1_pd +{ + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1]; +}; + +struct lvl2_pd +{ + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_2]; +}; + +struct lvl3_pd +{ + pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_3]; +}; + +struct lvl4_pt +{ + pte_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_4]; +}; + +static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags) +{ + return (pte_t){ .pte = cpu_to_be64(paddr | flags | PTE_LEAF) }; +} + +static inline pde_t paddr_to_pde(paddr_t paddr, unsigned long flags, unsigned long nls) +{ + return (pde_t){ .pde = cpu_to_be64(paddr | flags | nls) }; +} + +static inline paddr_t pte_to_paddr(pte_t pte) +{ + return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT); +} + +static inline paddr_t pde_to_paddr(pde_t pde) +{ + return be64_to_cpu(pde.pde) & (PDE_NLB_MASK << PDE_NLB_SHIFT); +} + +static inline bool pte_is_valid(pte_t pte) +{ + return pte.pte & be64_to_cpu(PTE_VALID); +} + +static inline bool pde_is_valid(pde_t pde) +{ + return pde.pde & be64_to_cpu(PDE_VALID); +} + +/* + * ISA 3.0 partition and process table entry format + */ +struct patb_entry { + __be64 patb0; + __be64 patb1; +}; +#define PATB0_HR PPC_BIT(0) /* host uses radix */ +#define PATB1_GR PPC_BIT(0) /* guest uses radix; must match HR */ + +struct prtb_entry { + __be64 prtb0; + __be64 reserved; +}; + +/* + * We support 52 bits, hence: + * bits 52 - 31 = 21, 0b10101 + * RTS encoding details + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long + */ +#define RTS_FIELD ((0x2UL << 61) | (0x5UL << 5)) + +void tlbie_all(void); + +#endif /* _ASM_PPC_PAGE_H */ diff --git a/xen/arch/ppc/include/asm/processor.h b/xen/arch/ppc/include/asm/processor.h index 2300640787..417288738c 100644 --- a/xen/arch/ppc/include/asm/processor.h +++ 
b/xen/arch/ppc/include/asm/processor.h @@ -133,6 +133,37 @@ struct cpu_user_regs uint32_t entry_vector; }; +static __inline__ void sync(void) +{ + asm volatile ( "sync" ); +} + +static __inline__ void isync(void) +{ + asm volatile ( "isync" ); +} + +static inline unsigned long mfmsr(void) +{ + unsigned long msr; + asm volatile ( "mfmsr %0" : "=&r"(msr) ); + return msr; +} + +static inline void mtmsrd(unsigned long msr) +{ + asm volatile ( "mtmsrd %0" : : "r"(msr) ); +} + +#define mtspr(reg, val) asm volatile ( "mtspr %0,%1" : : "K"(reg), "r"(val) ) + +#define mfspr(reg) \ + ({ \ + unsigned long val; \ + asm volatile ( "mfspr %0,%1" : "=r"(val) : "K"(reg) ); \ + val; \ + }) + /* * panic() isn't available at the moment so an infinite loop will be * used temporarily. diff --git a/xen/arch/ppc/include/asm/regs.h b/xen/arch/ppc/include/asm/regs.h new file mode 100644 index 0000000000..d5f47a6ff1 --- /dev/null +++ b/xen/arch/ppc/include/asm/regs.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 
2005, 2007 + * + * Authors: Jimi Xenidis <jimix@watson.ibm.com> + * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + */ + +#ifndef _ASM_REG_DEFS_H_ +#define _ASM_REG_DEFS_H_ + +/* Special Purpose Registers */ +#define SPRN_VRSAVE 256 +#define SPRN_DSISR 18 +#define SPRN_DAR 19 +#define SPRN_DEC 22 +#define SPRN_SRR0 26 +#define SPRN_SRR1 27 +#define SPRN_TBRL 268 +#define SPRN_TBRU 269 +#define SPRN_SPRG0 272 +#define SPRN_SPRG1 273 +#define SPRN_SPRG2 274 +#define SPRN_SPRG3 275 +#define SPRN_TBWL 284 +#define SPRN_TBWU 285 + +#define SPRN_HSPRG0 304 +#define SPRN_HSPRG1 305 +#define SPRN_HDEC 310 +#define SPRN_HIOR 311 +#define SPRN_RMOR 312 +#define SPRN_HRMOR 313 +#define SPRN_HSRR0 314 +#define SPRN_HSRR1 315 +#define SPRN_LPIDR 319 + +#define SPRN_PTCR 0x1D0 /* Partition table control Register */ +#define SPRN_PID 0x030 /* Process ID */ + +/* Performance monitor spr encodings */ +#define SPRN_MMCRA 786 +#define MMCRA_SAMPHV _AC(0x10000000, UL) /* state of MSR HV when SIAR set */ +#define MMCRA_SAMPPR _AC(0x08000000, UL) /* state of MSR PR when SIAR set */ +#define MMCRA_SAMPLE_ENABLE _AC(0x00000001, UL) /* enable sampling */ +#define NUM_PMCS 8 +#define SPRN_PMC1 787 +#define SPRN_PMC2 788 +#define SPRN_PMC3 789 +#define SPRN_PMC4 790 +#define SPRN_PMC5 791 +#define SPRN_PMC6 792 +#define SPRN_PMC7 793 +#define SPRN_PMC8 794 +#define SPRN_MMCR0 795 +#define MMCR0_FC _AC(0x80000000, UL) /* freeze counters */ +#define MMCR0_FCS _AC(0x40000000, UL) /* freeze in supervisor state */ +#define MMCR0_FCP _AC(0x20000000, UL) /* freeze in problem state */ +#define MMCR0_FCM1 _AC(0x10000000, UL) /* freeze counters while MSR mark = 1 */ +#define MMCR0_FCM0 _AC(0x08000000, UL) /* freeze counters while MSR mark = 0 */ +#define MMCR0_PMAE _AC(0x04000000, UL) /* performance monitor alert enabled */ +#define MMCR0_PMAO _AC(0x00000080, UL) /* performance monitor alert occurred */ +#define MMCR0_FCH _AC(0x00000001, UL) /* freeze conditions in hypervisor */ +#define SPRN_SIAR 796 +#define SPRN_SDAR 797 +#define SPRN_MMCR1 798 + +/* As defined for PU G4 */ +#define SPRN_HID0 1008 +#define SPRN_HID1 1009 +#define SPRN_HID4 1012 + +#define SPRN_DABR 1013 +#define SPRN_HID5 1014 +#define SPRN_DABRX 1015 +#define SPRN_HID6 1017 +#define SPRN_HID7 1018 +#define SPRN_HID8 1019 +#define SPRN_PIR 1023 + +#define SPRN_LPCR 0x13E /* LPAR Control Register */ +#define LPCR_VPM0 _AC(0x8000000000000000, UL) +#define LPCR_VPM1 _AC(0x4000000000000000, UL) +#define LPCR_ISL _AC(0x2000000000000000, UL) +#define LPCR_VC_SH 61 +#define LPCR_DPFD_SH 52 +#define LPCR_DPFD (_AC(7, UL) << LPCR_DPFD_SH) +#define LPCR_VRMASD_SH 47 +#define LPCR_VRMASD (_AC(0x1f, UL) << LPCR_VRMASD_SH) +#define LPCR_VRMA_L _AC(0x0008000000000000, UL) +#define LPCR_VRMA_LP0 _AC(0x0001000000000000, UL) +#define LPCR_VRMA_LP1 _AC(0x0000800000000000, UL) +#define LPCR_RMLS 0x1C000000 /* Implementation dependent RMO limit sel */ +#define LPCR_RMLS_SH 26 +#define LPCR_HAIL _AC(0x0000000004000000, UL) /* HV AIL (ISAv3.1) */ +#define LPCR_ILE _AC(0x0000000002000000, UL) /* !HV irqs set MSR:LE */ +#define LPCR_AIL _AC(0x0000000001800000, UL) /* Alternate interrupt location */ +#define LPCR_AIL_0 _AC(0x0000000000000000, UL) /* MMU off exception offset 0x0 */ +#define LPCR_AIL_3 _AC(0x0000000001800000, UL) /* MMU on exception offset 0xc00...4xxx */ +#define LPCR_ONL _AC(0x0000000000040000, UL) /* online - PURR/SPURR count */ +#define LPCR_LD _AC(0x0000000000020000, UL) /* large decremeter */ +#define LPCR_PECE _AC(0x000000000001f000, UL) /* powersave exit 
diff --git a/xen/arch/ppc/include/asm/types.h b/xen/arch/ppc/include/asm/types.h
index cee08e111a..325a2a27c3 100644
--- a/xen/arch/ppc/include/asm/types.h
+++ b/xen/arch/ppc/include/asm/types.h
@@ -15,6 +15,7 @@ typedef unsigned int u32;
 typedef signed long s64;
 typedef unsigned long u64;
 typedef unsigned long paddr_t;
+typedef unsigned long vaddr_t;
 
 #define INVALID_PADDR (~0UL)
 #define PRIpaddr "016lx"
diff --git a/xen/arch/ppc/mm-radix.c b/xen/arch/ppc/mm-radix.c
new file mode 100644
index 0000000000..071e71b73e
--- /dev/null
+++ b/xen/arch/ppc/mm-radix.c
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/types.h>
+#include <xen/lib.h>
+
+#include <asm/bitops.h>
+#include <asm/byteorder.h>
+#include <asm/early_printk.h>
+#include <asm/mm.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/regs.h>
+#include <asm/msr.h>
+
+void enable_mmu(void);
+
+#define INITIAL_LVL1_PD_COUNT      1
+#define INITIAL_LVL2_LVL3_PD_COUNT 2
+#define INITIAL_LVL4_PT_COUNT      256
+
+static size_t initial_lvl1_pd_pool_used;
+static struct lvl1_pd __aligned(sizeof(struct lvl1_pd))
+initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT];
+
+static size_t initial_lvl2_lvl3_pd_pool_used;
+static struct lvl2_pd __aligned(sizeof(struct lvl2_pd))
+initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT];
+
+static size_t initial_lvl4_pt_pool_used;
+static struct lvl4_pt __aligned(sizeof(struct lvl4_pt))
+initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT];
+
+/* Only reserve minimum Partition and Process tables */
+#define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */
+#define PATB_SIZE      (1UL << PATB_SIZE_LOG2)
+#define PRTB_SIZE_LOG2 12
+#define PRTB_SIZE      (1UL << PRTB_SIZE_LOG2)
+
+static struct patb_entry
+    __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)];
+
+static struct prtb_entry
+    __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)];
+
+_Static_assert(sizeof(initial_patb) == PATB_SIZE);
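An aside, to make the sizing concrete (illustrative only, assuming struct patb_entry and struct prtb_entry are each a pair of 64-bit doublewords, i.e. the patb0/patb1 and prtb0 fields used further down):

    /* 2^16 B / 16 B per entry = 4096 partition-table entries;
     * 2^12 B / 16 B per entry =  256 process-table entries.
     * A check along these lines, placed in any init function
     * (mirroring the BUILD_BUG_ON() in lvl3_pd_pool_alloc() below),
     * would pin the assumption down:
     */
    BUILD_BUG_ON(sizeof(struct patb_entry) != 2 * sizeof(uint64_t));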
+
+static __init struct lvl1_pd *lvl1_pd_pool_alloc(void)
+{
+    if ( initial_lvl1_pd_pool_used >= INITIAL_LVL1_PD_COUNT )
+    {
+        early_printk("Ran out of space for LVL1 PD!\n");
+        die();
+    }
+
+    return &initial_lvl1_pd_pool[initial_lvl1_pd_pool_used++];
+}
+
+static __init struct lvl2_pd *lvl2_pd_pool_alloc(void)
+{
+    if ( initial_lvl2_lvl3_pd_pool_used >= INITIAL_LVL2_LVL3_PD_COUNT )
+    {
+        early_printk("Ran out of space for LVL2/3 PD!\n");
+        die();
+    }
+
+    return &initial_lvl2_lvl3_pd_pool[initial_lvl2_lvl3_pd_pool_used++];
+}
+
+static __init struct lvl3_pd *lvl3_pd_pool_alloc(void)
+{
+    BUILD_BUG_ON(sizeof(struct lvl3_pd) != sizeof(struct lvl2_pd));
+
+    return (struct lvl3_pd *) lvl2_pd_pool_alloc();
+}
+
+static __init struct lvl4_pt *lvl4_pt_pool_alloc(void)
+{
+    if ( initial_lvl4_pt_pool_used >= INITIAL_LVL4_PT_COUNT )
+    {
+        early_printk("Ran out of space for LVL4 PT!\n");
+        die();
+    }
+
+    return &initial_lvl4_pt_pool[initial_lvl4_pt_pool_used++];
+}
+
+#ifndef NDEBUG
+/* TODO: Remove once we get common/ building */
+static char *__init itoa64_hex(uint64_t val, char *out_buf, size_t buf_len)
+{
+    uint64_t cur;
+    size_t i = buf_len - 1;
+
+    /* Null terminate buffer */
+    out_buf[i] = '\0';
+
+    /* Add digits in reverse */
+    cur = val;
+    while ( cur && i > 0 )
+    {
+        out_buf[--i] = "0123456789ABCDEF"[cur % 16];
+        cur /= 16;
+    }
+
+    /* Pad to 16 digits */
+    while ( i > 0 )
+        out_buf[--i] = '0';
+
+    return out_buf + i;
+}
+#endif
+
+static void __init radix_dprint(uint64_t addr, const char *msg)
+{
+#ifndef NDEBUG
+    char buf[sizeof("DEADBEEFCAFEBABA")];
+    char *addr_s = itoa64_hex(addr, buf, sizeof(buf));
+
+    early_printk("(0x");
+    early_printk(addr_s);
+    early_printk(") ");
+    early_printk(msg);
+#endif
+}
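For illustration, with example values only: in a debug (!NDEBUG) build, a call like

    radix_dprint(0xdeadbeefUL, "mapped\n");

prints "(0x00000000DEADBEEF) mapped" on the early console, since itoa64_hex() zero-pads its argument to 16 uppercase hex digits.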
+
+static void __init setup_initial_mapping(struct lvl1_pd *lvl1,
+                                         vaddr_t map_start,
+                                         vaddr_t map_end,
+                                         paddr_t phys_base)
+{
+    uint64_t page_addr;
+
+    if ( map_start & ~PAGE_MASK )
+    {
+        early_printk("Xen _start must be aligned to 64k (PAGE_SIZE) boundary\n");
+        die();
+    }
+
+    if ( (uint64_t) phys_base & ~PAGE_MASK )
+    {
+        early_printk("Xen should be loaded at 64k (PAGE_SIZE) boundary\n");
+        die();
+    }
+
+    for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE )
+    {
+        struct lvl2_pd *lvl2;
+        struct lvl3_pd *lvl3;
+        struct lvl4_pt *lvl4;
+        pde_t *pde;
+        pte_t *pte;
+
+        /* Allocate LVL 2 PD if necessary */
+        pde = pt_entry(lvl1, page_addr);
+        if ( !pde_is_valid(*pde) )
+        {
+            lvl2 = lvl2_pd_pool_alloc();
+            *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2);
+        }
+        else
+            lvl2 = (struct lvl2_pd *) pde_to_paddr(*pde);
+
+        /* Allocate LVL 3 PD if necessary */
+        pde = pt_entry(lvl2, page_addr);
+        if ( !pde_is_valid(*pde) )
+        {
+            lvl3 = lvl3_pd_pool_alloc();
+            *pde = paddr_to_pde(__pa(lvl3), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_3);
+        }
+        else
+            lvl3 = (struct lvl3_pd *) pde_to_paddr(*pde);
+
+        /* Allocate LVL 4 PT if necessary */
+        pde = pt_entry(lvl3, page_addr);
+        if ( !pde_is_valid(*pde) )
+        {
+            lvl4 = lvl4_pt_pool_alloc();
+            *pde = paddr_to_pde(__pa(lvl4), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_4);
+        }
+        else
+            lvl4 = (struct lvl4_pt *) pde_to_paddr(*pde);
+
+        /* Finally, create PTE in LVL 4 PT */
+        pte = pt_entry(lvl4, page_addr);
+        if ( !pte_is_valid(*pte) )
+        {
+            unsigned long paddr = (page_addr - map_start) + phys_base;
+            unsigned long flags;
+            radix_dprint(paddr, "being mapped to ");
+            radix_dprint(page_addr, "!\n");
+            if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
+            {
+                radix_dprint(page_addr, "being marked as TEXT (RX)\n");
+                flags = PTE_XEN_RX;
+            }
+            else if ( is_kernel_rodata(page_addr) )
+            {
+                radix_dprint(page_addr, "being marked as RODATA (RO)\n");
+                flags = PTE_XEN_RO;
+            }
+            else
+            {
+                radix_dprint(page_addr, "being marked as DEFAULT (RW)\n");
+                flags = PTE_XEN_RW;
+            }
+
+            *pte = paddr_to_pte(paddr, flags);
+            radix_dprint(paddr_to_pte(paddr, flags).pte,
+                         "is result of PTE map!\n");
+        }
+        else
+        {
+            early_printk("BUG: Tried to create PTE for already-mapped page!");
+            die();
+        }
+    }
+}
+
+static void setup_partition_table(struct patb_entry *patb, struct lvl1_pd *root)
+{
+    unsigned long ptcr;
+
+    /* Configure entry for LPID 0 to enable Radix and point to root PD */
+    uint64_t patb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1 | PATB0_HR;
+    uint64_t patb1 = __pa(initial_prtb) | (PRTB_SIZE_LOG2 - 12) | PATB1_GR;
+    patb[0].patb0 = cpu_to_be64(patb0);
+    patb[0].patb1 = cpu_to_be64(patb1);
+
+    ptcr = __pa(initial_patb) | (PATB_SIZE_LOG2 - 12);
+    mtspr(SPRN_PTCR, ptcr);
+}
+
+static void setup_process_table(struct prtb_entry *prtb, struct lvl1_pd *root)
+{
+    /* Configure entry for PID 0 to point to root PD */
+    uint64_t prtb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1;
+    initial_prtb[0].prtb0 = cpu_to_be64(prtb0);
+}
+
+void __init setup_initial_pagetables(void)
+{
+    struct lvl1_pd *root = lvl1_pd_pool_alloc();
+    unsigned long lpcr;
+
+    setup_initial_mapping(root,
+                          (vaddr_t) _start,
+                          (vaddr_t) _end,
+                          __pa(_start));
+
+    /* Enable Radix mode in LPCR */
+    lpcr = mfspr(SPRN_LPCR);
+    mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+    early_printk("Enabled radix in LPCR\n");
+
+    /* Set up initial process table */
+    setup_process_table(initial_prtb, root);
+
+    /* Set up initial partition table */
+    setup_partition_table(initial_patb, root);
+
+    /* Flush TLB */
+    tlbie_all();
+    early_printk("Flushed TLB\n");
+
+    /* Turn on the MMU */
+    enable_mmu();
+}
diff --git a/xen/arch/ppc/opal.c b/xen/arch/ppc/opal.c
index cc75ccfcbe..727f8726fa 100644
--- a/xen/arch/ppc/opal.c
+++ b/xen/arch/ppc/opal.c
@@ -1,8 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
+#include "xen/compiler.h"
 #include <asm/boot.h>
 #include <asm/early_printk.h>
 #include <asm/opal-api.h>
 #include <asm/processor.h>
+#include <asm/mm.h>
 #include <xen/types.h>
 #include <xen/libfdt/libfdt.h>
 #include <xen/init.h>
diff --git a/xen/arch/ppc/ppc64/head.S b/xen/arch/ppc/ppc64/head.S
index beff8257fa..5ae85ae123 100644
--- a/xen/arch/ppc/ppc64/head.S
+++ b/xen/arch/ppc/ppc64/head.S
@@ -2,6 +2,7 @@
 
 #include <asm/asm-defns.h>
 #include <asm/asm-offsets.h>
+#include <asm/msr.h>
 
 .section .text.header, "ax", %progbits
 
@@ -67,3 +68,17 @@ ENTRY(start)
 
     .size start, . - start
     .type start, %function
+
+ENTRY(enable_mmu)
+    mflr %r3
+    mfmsr %r4
+
+    /* enable instruction and data relocation (MMU) */
+    ori %r4, %r4, (MSR_IR | MSR_DR)
+
+    mtsrr0 %r3 /* return to caller after MMU enable */
+    mtsrr1 %r4
+    rfid
+
+    .size enable_mmu, . - enable_mmu
+    .type enable_mmu, %function
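An illustrative register-transfer sketch of the enable_mmu sequence above (pseudo-steps, not real code):

    /* SRR0 <- LR                     mflr/mtsrr0: caller's return address
     * SRR1 <- MSR | MSR_IR | MSR_DR  mfmsr/ori/mtsrr1: relocation enabled
     * rfid: PC <- SRR0, MSR <- SRR1
     */

So enable_mmu() "returns" to its caller with instruction and data translation switched on, which is why setup_initial_pagetables() must already have built the identity mapping it resumes under.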
diff --git a/xen/arch/ppc/setup.c b/xen/arch/ppc/setup.c
index bacd6d1acd..466993987b 100644
--- a/xen/arch/ppc/setup.c
+++ b/xen/arch/ppc/setup.c
@@ -7,6 +7,8 @@
 /* Xen stack for bringing up the first CPU. */
 unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE);
 
+void __init setup_initial_pagetables(void);
+
 void __init noreturn start_xen(unsigned long r3, unsigned long r4,
                                unsigned long r5, unsigned long r6,
                                unsigned long r7)
@@ -27,6 +29,8 @@ void __init noreturn start_xen(unsigned long r3, unsigned long r4,
         boot_opal_init((void *) r3);
     }
 
+    setup_initial_pagetables();
+
     early_printk("Hello, ppc64le!\n");
 
     for ( ; ; )
diff --git a/xen/arch/ppc/tlb-radix.c b/xen/arch/ppc/tlb-radix.c
new file mode 100644
index 0000000000..69934076a7
--- /dev/null
+++ b/xen/arch/ppc/tlb-radix.c
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Based on arch/powerpc/mm/book3s64/radix_tlb.c from Linux with the following
+ * copyright notice:
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ */
+#include <xen/stringify.h>
+
+#include <asm/bitops.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+/* TLB flush actions. Used as argument to tlbiel_flush() */
+enum
+{
+    TLB_INVAL_SCOPE_LPID,   /* invalidate TLBs for current LPID */
+    TLB_INVAL_SCOPE_GLOBAL, /* invalidate all TLBs */
+};
+
+#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */
+
+#define RIC_FLUSH_TLB 0
+#define RIC_FLUSH_PWC 1
+#define RIC_FLUSH_ALL 2
+
+static void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
+                                    unsigned int pid, unsigned int ric,
+                                    unsigned int prs)
+{
+    unsigned long rb;
+    unsigned long rs;
+
+    rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+    rs = ((unsigned long) pid << PPC_BITLSHIFT(31));
+
+    asm volatile ( "tlbiel %0, %1, %2, %3, 1" :
+                   : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory" );
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+    unsigned int set;
+
+    asm volatile ( "ptesync" : : : "memory" );
+
+    /*
+     * Flush the first set of the TLB, and the entire Page Walk Cache
+     * and partition table entries. Then flush the remaining sets of the
+     * TLB.
+     */
+
+    if ( mfmsr() & MSR_HV )
+    {
+        /* MSR[HV] should flush partition scope translations first. */
+        tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
+
+        for ( set = 1; set < num_sets; set++ )
+            tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
+    }
+
+    /* Flush process scoped entries. */
+    tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
+
+    for ( set = 1; set < num_sets; set++ )
+        tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
+
+    asm volatile ( "ptesync" : : : "memory" );
+}
+
+void radix__tlbiel_all(unsigned int action)
+{
+    unsigned int is;
+
+    switch ( action )
+    {
+    case TLB_INVAL_SCOPE_GLOBAL:
+        is = 3;
+        break;
+    case TLB_INVAL_SCOPE_LPID:
+        is = 2;
+        break;
+    default:
+        die();
+    }
+
+    tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
+
+    asm volatile ( "slbia 7; isync" : : : "memory" );
+}
+
+void tlbie_all(void)
+{
+    radix__tlbiel_all(TLB_INVAL_SCOPE_GLOBAL);
+}
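A worked example, for illustration: since PPC_BITLSHIFT(be) is 63 - be, a call such as tlbiel_radix_set_isa300(1, 3, 0, RIC_FLUSH_TLB, 1) builds

    rb = (1 << 12) | (3 << 10) = 0x1c00  /* set ends at IBM bit 51, IS at 53 */
    rs = 0UL << 32             = 0       /* PID field in the upper word */

i.e. an IS=3 (all-entries) flush of TLB set 1 for process-scoped translations.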
Add code to construct early identity-mapped page tables as well as the
required process and partition tables to enable the MMU.

Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
---
 xen/arch/ppc/Makefile                |   2 +
 xen/arch/ppc/include/asm/bitops.h    |  11 ++
 xen/arch/ppc/include/asm/mm.h        |  19 ++
 xen/arch/ppc/include/asm/page-bits.h |   3 +-
 xen/arch/ppc/include/asm/page.h      | 178 ++++++++++++++++++
 xen/arch/ppc/include/asm/processor.h |  31 ++++
 xen/arch/ppc/include/asm/regs.h      | 138 ++++++++++++++
 xen/arch/ppc/include/asm/types.h     |   1 +
 xen/arch/ppc/mm-radix.c              | 268 +++++++++++++++++++++++++++
 xen/arch/ppc/opal.c                  |   2 +
 xen/arch/ppc/ppc64/head.S            |  15 ++
 xen/arch/ppc/setup.c                 |   4 +
 xen/arch/ppc/tlb-radix.c             |  95 ++++++++++
 13 files changed, 766 insertions(+), 1 deletion(-)
 create mode 100644 xen/arch/ppc/include/asm/bitops.h
 create mode 100644 xen/arch/ppc/include/asm/mm.h
 create mode 100644 xen/arch/ppc/include/asm/page.h
 create mode 100644 xen/arch/ppc/include/asm/regs.h
 create mode 100644 xen/arch/ppc/mm-radix.c
 create mode 100644 xen/arch/ppc/tlb-radix.c