
[3/3] xen/ppc: Implement initial Radix MMU support

Message ID ed166c38bbcf82ad58a14353a880d1e208cb2ff1.1690582001.git.sanastasio@raptorengineering.com (mailing list archive)
State Superseded
Series xen/ppc: Add early Radix MMU support

Commit Message

Shawn Anastasio July 28, 2023, 10:21 p.m. UTC
Add code to construct early identity-mapped page tables as well as the
required process and partition tables to enable the MMU.

Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
---
 xen/arch/ppc/Makefile                |   2 +
 xen/arch/ppc/include/asm/bitops.h    |  11 ++
 xen/arch/ppc/include/asm/mm.h        |  19 ++
 xen/arch/ppc/include/asm/page-bits.h |   3 +-
 xen/arch/ppc/include/asm/page.h      | 178 ++++++++++++++++++
 xen/arch/ppc/include/asm/processor.h |  31 ++++
 xen/arch/ppc/include/asm/regs.h      | 138 ++++++++++++++
 xen/arch/ppc/include/asm/types.h     |   1 +
 xen/arch/ppc/mm-radix.c              | 268 +++++++++++++++++++++++++++
 xen/arch/ppc/opal.c                  |   2 +
 xen/arch/ppc/ppc64/head.S            |  15 ++
 xen/arch/ppc/setup.c                 |   4 +
 xen/arch/ppc/tlb-radix.c             |  95 ++++++++++
 13 files changed, 766 insertions(+), 1 deletion(-)
 create mode 100644 xen/arch/ppc/include/asm/bitops.h
 create mode 100644 xen/arch/ppc/include/asm/mm.h
 create mode 100644 xen/arch/ppc/include/asm/page.h
 create mode 100644 xen/arch/ppc/include/asm/regs.h
 create mode 100644 xen/arch/ppc/mm-radix.c
 create mode 100644 xen/arch/ppc/tlb-radix.c

Comments

Jan Beulich Aug. 1, 2023, 1:18 p.m. UTC | #1
On 29.07.2023 00:21, Shawn Anastasio wrote:
> --- /dev/null
> +++ b/xen/arch/ppc/include/asm/bitops.h
> @@ -0,0 +1,11 @@
> +#ifndef _ASM_PPC_BITOPS_H
> +#define _ASM_PPC_BITOPS_H
> +
> +#include <xen/compiler.h>

Not a big deal, but ...

> +/* PPC bit number conversion */
> +#define PPC_BITLSHIFT(be)	(BITS_PER_LONG - 1 - (be))
> +#define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
> +#define PPC_BITMASK(bs, be)	((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
> +
> +#endif /* _ASM_PPC_BITOPS_H */

... nothing here looks to require that #include.
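
For reference: PPC_BIT() follows IBM's MSB-0 numbering, i.e. bit 0 is
the most significant bit of a 64-bit long. A quick sanity check,
assuming BITS_PER_LONG == 64 (illustration only, not part of the patch):

    _Static_assert(PPC_BIT(0) == 0x8000000000000000UL, "bit 0 is the MSB");
    _Static_assert(PPC_BIT(63) == 1UL, "bit 63 is the LSB");
    _Static_assert(PPC_BITMASK(0, 3) == 0xf000000000000000UL,
                   "PPC_BITMASK() is inclusive of both endpoints");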

> --- /dev/null
> +++ b/xen/arch/ppc/include/asm/mm.h
> @@ -0,0 +1,19 @@
> +#ifndef _ASM_PPC_MM_H
> +#define _ASM_PPC_MM_H
> +
> +#include <asm/config.h>

This is included by xen/config.h, which in turn is included from the
compiler command line.

> --- /dev/null
> +++ b/xen/arch/ppc/include/asm/page.h
> @@ -0,0 +1,178 @@
> +#ifndef _ASM_PPC_PAGE_H
> +#define _ASM_PPC_PAGE_H
> +
> +#include <xen/types.h>
> +
> +#include <asm/bitops.h>
> +#include <asm/byteorder.h>
> +
> +#define PDE_VALID     PPC_BIT(0)
> +#define PDE_NLB_MASK  0xfffffffffffffUL
> +#define PDE_NLB_SHIFT 5UL
> +#define PDE_NLS_MASK  0x1f
> +
> +#define PTE_VALID     PPC_BIT(0)
> +#define PTE_LEAF      PPC_BIT(1)
> +#define PTE_REFERENCE PPC_BIT(55)
> +#define PTE_CHANGE    PPC_BIT(56)
> +
> +/* PTE Attributes */
> +#define PTE_ATT_SAO            PPC_BIT(59) /* Strong Access Ordering */
> +#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58)
> +#define PTE_ATT_TOLERANT       (PPC_BIT(58) | PPC_BIT(59))
> +
> +/* PTE Encoded Access Authority */
> +#define PTE_EAA_PRIVILEGED PPC_BIT(60)
> +#define PTE_EAA_READ       PPC_BIT(61)
> +#define PTE_EAA_WRITE      PPC_BIT(62)
> +#define PTE_EAA_EXECUTE    PPC_BIT(63)
> +
> +/* Field shifts/masks */
> +#define PTE_RPN_MASK  0x1fffffffffffUL
> +#define PTE_RPN_SHIFT 12UL
> +#define PTE_ATT_MASK  0x3UL
> +#define PTE_ATT_SHIFT 4UL
> +#define PTE_EAA_MASK  0xfUL

Did you consider introducing just *_MASK values, utilizing MASK_INSR()
and MASK_EXTR() instead of open-coded shifting/masking?

> +#define PTE_XEN_BASE (PTE_VALID | PTE_EAA_PRIVILEGED | PTE_REFERENCE)
> +#define PTE_XEN_RW   (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_WRITE | PTE_CHANGE)
> +#define PTE_XEN_RO   (PTE_XEN_BASE | PTE_EAA_READ)
> +#define PTE_XEN_RX   (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_EXECUTE)
> +
> +/*
> + * Radix Tree layout for 64KB pages:
> + *
> + * [ L1 (ROOT) PAGE DIRECTORY (8192 * sizeof(pde_t)) ]
> + *                     |
> + *                     |
> + *                     v
> + *    [ L2 PAGE DIRECTORY (512 * sizeof(pde_t)) ]
> + *                     |
> + *                     |
> + *                     v
> + *    [ L3 PAGE DIRECTORY (512 * sizeof(pde_t)) ]
> + *                     |
> + *                     |
> + *                     v
> + *      [ L4 PAGE TABLE (32 * sizeof(pte_t)) ]
> + *                     |
> + *                     |
> + *                     v
> + *            [ PAGE TABLE ENTRY ]
> + */
> +
> +#define XEN_PT_ENTRIES_LOG2_LVL_1 13 /* 2**13 entries, maps 2**13 * 512GB = 4PB */
> +#define XEN_PT_ENTRIES_LOG2_LVL_2 9  /* 2**9  entries, maps 2**9  * 1GB = 512GB */
> +#define XEN_PT_ENTRIES_LOG2_LVL_3 9  /* 2**9  entries, maps 2**9  * 2MB = 1GB */
> +#define XEN_PT_ENTRIES_LOG2_LVL_4 5  /* 2**5  entries, maps 2**5  * 64K = 2MB */
> +
> +#define XEN_PT_SHIFT_LVL_1    (XEN_PT_SHIFT_LVL_2 + XEN_PT_ENTRIES_LOG2_LVL_2)
> +#define XEN_PT_SHIFT_LVL_2    (XEN_PT_SHIFT_LVL_3 + XEN_PT_ENTRIES_LOG2_LVL_3)
> +#define XEN_PT_SHIFT_LVL_3    (XEN_PT_SHIFT_LVL_4 + XEN_PT_ENTRIES_LOG2_LVL_4)
> +#define XEN_PT_SHIFT_LVL_4    PAGE_SHIFT
> +
> +#define XEN_PT_ENTRIES_LOG2_LVL(lvl) (XEN_PT_ENTRIES_LOG2_LVL_##lvl)
> +#define XEN_PT_SHIFT_LVL(lvl)        (XEN_PT_SHIFT_LVL_##lvl)
> +#define XEN_PT_ENTRIES_LVL(lvl)      (1UL << XEN_PT_ENTRIES_LOG2_LVL(lvl))
> +#define XEN_PT_MASK_LVL(lvl)         (XEN_PT_ENTRIES_LVL(lvl) - 1)
> +#define XEN_PT_INDEX_LVL(lvl, va)    ((va >> XEN_PT_SHIFT_LVL(lvl)) & XEN_PT_MASK_LVL(lvl))
> +
> +/*
> + * Calculate the index of the provided virtual address in the provided
> + * page table struct
> + */
> +#define pt_index(pt, va) _Generic((pt), \

Earlier on I did ask about the minimum compiler version you require for
building the PPC hypervisor. Iirc you said about any, but here you're
using another feature where I'm not sure how far back it is available.
As indicated back then, it would be nice if ./README could gain
respective information.

> +    struct lvl1_pd * : XEN_PT_INDEX_LVL(1, (va)), \
> +    struct lvl2_pd * : XEN_PT_INDEX_LVL(2, (va)), \
> +    struct lvl3_pd * : XEN_PT_INDEX_LVL(3, (va)), \
> +    struct lvl4_pt * : XEN_PT_INDEX_LVL(4, (va)))
> +
> +#define pt_entry(pt, va) (&((pt)->entries[pt_index((pt), (va))]))

As to style: Here (and elsewhere) you put unnecessary parentheses
around two of the three uses of the macro arguments. If you think you
want to keep it like that, feel free; in general we're recommending to
omit ones not really needed, to help readability. Otoh a little higher
up in XEN_PT_INDEX_LVL() parentheses are missing around va.

> +typedef struct
> +{
> +    __be64 pde;
> +} pde_t;
> +
> +typedef struct
> +{
> +    __be64 pte;
> +} pte_t;
> +
> +struct lvl1_pd
> +{
> +    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1];
> +};
> +
> +struct lvl2_pd
> +{
> +    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_2];
> +};
> +
> +struct lvl3_pd
> +{
> +    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_3];
> +};
> +
> +struct lvl4_pt
> +{
> +    pte_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_4];
> +};
> +
> +static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags)
> +{
> +    return (pte_t){ .pte = cpu_to_be64(paddr | flags | PTE_LEAF) };
> +}
> +
> +static inline pde_t paddr_to_pde(paddr_t paddr, unsigned long flags, unsigned long nls)
> +{
> +    return (pde_t){ .pde = cpu_to_be64(paddr | flags | nls) };
> +}

In these two, are you sure you want to demand all callers to only ever
pass in page-aligned addresses?

> +static inline paddr_t pte_to_paddr(pte_t pte)
> +{
> +    return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT);
> +}
> +
> +static inline paddr_t pde_to_paddr(pde_t pde)
> +{
> +    return be64_to_cpu(pde.pde) & (PDE_NLB_MASK << PDE_NLB_SHIFT);
> +}
> +
> +static inline bool pte_is_valid(pte_t pte)
> +{
> +    return pte.pte & be64_to_cpu(PTE_VALID);
> +}
> +
> +static inline bool pde_is_valid(pde_t pde)
> +{
> +    return pde.pde & be64_to_cpu(PDE_VALID);
> +}
> +
> +/*
> + * ISA 3.0 partition and process table entry format
> + */
> +struct patb_entry {
> +	__be64 patb0;
> +	__be64 patb1;
> +};
> +#define PATB0_HR PPC_BIT(0) /* host uses radix */
> +#define PATB1_GR PPC_BIT(0) /* guest uses radix; must match HR */
> +
> +struct prtb_entry {
> +	__be64 prtb0;
> +	__be64 reserved;
> +};
> +
> +/*
> + * We support 52 bits, hence:
> + * bits 52 - 31 = 21, 0b10101
> + * RTS encoding details
> + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long
> + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
> + */
> +#define RTS_FIELD ((0x2UL << 61) | (0x5UL << 5))

I'm afraid I can't bring comment and #define together. Bits 0-3 are
4 bits; bits 6-8 are only three. And I'd expect the lower ends of
the ranges to appear as shift counts in the expression.

> --- a/xen/arch/ppc/include/asm/processor.h
> +++ b/xen/arch/ppc/include/asm/processor.h
> @@ -133,6 +133,37 @@ struct cpu_user_regs
>      uint32_t entry_vector;
>  };
>  
> +static __inline__ void sync(void)
> +{
> +    asm volatile ( "sync" );
> +}
> +
> +static __inline__ void isync(void)
> +{
> +    asm volatile ( "isync" );
> +}
> +
> +static inline unsigned long mfmsr(void)
> +{
> +    unsigned long msr;
> +    asm volatile ( "mfmsr %0" : "=&r"(msr) );

Why the &?

Also nit: Missing blank between closing double quote and opening paren.
More instances of this below.

> +    return msr;
> +}
> +
> +static inline void mtmsrd(unsigned long msr)
> +{
> +    asm volatile ( "mtmsrd %0" : : "r"(msr) );
> +}
> +
> +#define mtspr(reg, val) asm volatile ( "mtspr %0,%1" : : "K"(reg), "r"(val) )
> +
> +#define mfspr(reg)                                                             \
> +    ({                                                                         \
> +        unsigned long val;                                                     \
> +        asm volatile ( "mfspr %0,%1" : "=r"(val) : "K"(reg) );                 \
> +        val;                                                                   \
> +    })

Why #define-s here when higher up you successfully use inline functions?

Also (nit) blanks would again be nice ahead of the commas.

> --- /dev/null
> +++ b/xen/arch/ppc/include/asm/regs.h
> @@ -0,0 +1,138 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

You have an SPDX line first thing - why the spelled out license?

> --- /dev/null
> +++ b/xen/arch/ppc/mm-radix.c
> @@ -0,0 +1,268 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#include <xen/init.h>
> +#include <xen/kernel.h>
> +#include <xen/types.h>
> +#include <xen/lib.h>

Nit: Please sort headers whenever possible; also ...

> +#include <asm/bitops.h>
> +#include <asm/byteorder.h>
> +#include <asm/early_printk.h>
> +#include <asm/mm.h>
> +#include <asm/page.h>
> +#include <asm/processor.h>
> +#include <asm/regs.h>
> +#include <asm/msr.h>

... this group then.

> +void enable_mmu(void);
> +
> +#define INITIAL_LVL1_PD_COUNT      1
> +#define INITIAL_LVL2_LVL3_PD_COUNT 2
> +#define INITIAL_LVL4_PT_COUNT      256
> +
> +static size_t initial_lvl1_pd_pool_used;
> +static struct lvl1_pd __aligned(sizeof(struct lvl1_pd))
> +initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT];
> +
> +static size_t initial_lvl2_lvl3_pd_pool_used;
> +static struct lvl2_pd __aligned(sizeof(struct lvl2_pd))
> +initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT];
> +
> +static size_t initial_lvl4_pt_pool_used;
> +static struct lvl4_pt __aligned(sizeof(struct lvl4_pt))
> +initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT];

Can instances of these structures usefully exist and be not suitably
aligned? If not, having the types rather than the instances already
declare the necessary alignment would seem better to me.

> +/* Only reserve minimum Partition and Process tables  */
> +#define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */
> +#define PATB_SIZE      (1UL << PATB_SIZE_LOG2)
> +#define PRTB_SIZE_LOG2 12
> +#define PRTB_SIZE      (1UL << PRTB_SIZE_LOG2)
> +
> +static struct patb_entry
> +    __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)];
> +
> +static struct prtb_entry
> +    __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)];

The situation is different here, as the arrays here aren't embedded into
the type (their elements being just single entries).

> +_Static_assert(sizeof(initial_patb) == PATB_SIZE);

Is this really useful? And if so, why no similar check for prtb?
Furthermore BUILD_BUG_ON() would be better; this again plays into the
question of what range of compilers is suitable for building this.

> +static void __init setup_initial_mapping(struct lvl1_pd *lvl1,
> +                                         vaddr_t map_start,
> +                                         vaddr_t map_end,
> +                                         paddr_t phys_base)
> +{
> +    uint64_t page_addr;
> +
> +    if ( map_start & ~PAGE_MASK )
> +    {
> +        early_printk("Xen _start be aligned to 64k (PAGE_SIZE) boundary\n");
> +        die();
> +    }
> +
> +    if ( (uint64_t) phys_base & ~PAGE_MASK )

Why the cast?

> +    {
> +        early_printk("Xen should be loaded at 64k (PAGE_SIZE) boundary\n");
> +        die();
> +    }
> +
> +    for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE )
> +    {
> +        struct lvl2_pd *lvl2;
> +        struct lvl3_pd *lvl3;
> +        struct lvl4_pt *lvl4;
> +        pde_t *pde;
> +        pte_t *pte;
> +
> +        /* Allocate LVL 2 PD if necessary */
> +        pde = pt_entry(lvl1, page_addr);
> +        if ( !pde_is_valid(*pde) )
> +        {
> +            lvl2 = lvl2_pd_pool_alloc();
> +            *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2);

paddr_to_pde() doesn't mask off the top bits. Are you relying on
lvl2_pd_pool_alloc() using PIC addressing to get at the value it
then returns?

Also this and the similar lines below look to be a little too long.

> +        }
> +        else
> +            lvl2 = (struct lvl2_pd *) pde_to_paddr(*pde);

This casts a physical address to a pointer; normally only virtual
addresses can be used this way. Could there be a helper added to
have such a dangerous-looking construct in (ideally) just a single
place?

> +        /* Allocate LVL 3 PD if necessary */
> +        pde = pt_entry(lvl2, page_addr);
> +        if ( !pde_is_valid(*pde) )
> +        {
> +            lvl3 = lvl3_pd_pool_alloc();
> +            *pde = paddr_to_pde(__pa(lvl3), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_3);
> +        }
> +        else
> +            lvl3 = (struct lvl3_pd *) pde_to_paddr(*pde);
> +
> +        /* Allocate LVL 4 PT if necessary */
> +        pde = pt_entry(lvl3, page_addr);
> +        if ( !pde_is_valid(*pde) )
> +        {
> +            lvl4 = lvl4_pt_pool_alloc();
> +            *pde = paddr_to_pde(__pa(lvl4), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_4);
> +        }
> +        else
> +            lvl4 = (struct lvl4_pt *) pde_to_paddr(*pde);
> +
> +        /* Finally, create PTE in LVL 4 PT */
> +        pte = pt_entry(lvl4, page_addr);
> +        if ( !pte_is_valid(*pte) )
> +        {
> +            unsigned long paddr = (page_addr - map_start) + phys_base;
> +            unsigned long flags;
> +            radix_dprint(paddr, "being mapped to ");

Blank line again please between declarations and statements.

> +            radix_dprint(page_addr, "!\n");
> +            if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
> +            {
> +                radix_dprint(page_addr, "being marked as TEXT (RX)\n");
> +                flags = PTE_XEN_RX;
> +            }
> +            else if ( is_kernel_rodata(page_addr) )
> +            {
> +                radix_dprint(page_addr, "being marked as RODATA (RO)\n");
> +                flags = PTE_XEN_RO;
> +            }
> +            else
> +            {
> +                radix_dprint(page_addr, "being marked as DEFAULT (RW)\n");
> +                flags = PTE_XEN_RW;
> +            }
> +
> +            *pte = paddr_to_pte(paddr, flags);
> +            radix_dprint(paddr_to_pte(paddr, flags).pte,
> +                             "is result of PTE map!\n");
> +        }
> +        else
> +        {
> +            early_printk("BUG: Tried to create PTE for already-mapped page!");
> +            die();
> +        }
> +    }
> +}
> +
> +static void setup_partition_table(struct patb_entry *patb, struct lvl1_pd *root)

__init?

> +{
> +    unsigned long ptcr;
> +
> +    /* Configure entry for LPID 0 to enable Radix and point to root PD */
> +    uint64_t patb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1 | PATB0_HR;
> +    uint64_t patb1 = __pa(initial_prtb) | (PRTB_SIZE_LOG2 - 12) | PATB1_GR;
> +    patb[0].patb0 = cpu_to_be64(patb0);
> +    patb[0].patb1 = cpu_to_be64(patb1);
> +
> +    ptcr = __pa(initial_patb) | (PATB_SIZE_LOG2 - 12);
> +    mtspr(SPRN_PTCR, ptcr);
> +}
> +
> +static void setup_process_table(struct prtb_entry *prtb, struct lvl1_pd *root)

__init?

> +{
> +    /* Configure entry for PID 0 to point to root PD */
> +    uint64_t prtb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1;
> +    initial_prtb[0].prtb0 = cpu_to_be64(prtb0);
> +}
> +
> +void __init setup_initial_pagetables(void)
> +{
> +    struct lvl1_pd *root = lvl1_pd_pool_alloc();
> +    unsigned long lpcr;
> +
> +    setup_initial_mapping(root,
> +                         (vaddr_t) _start,
> +                         (vaddr_t) _end,
> +                          __pa(_start));

    setup_initial_mapping(root,
                          (vaddr_t)_start,
                          (vaddr_t)_end,
                          __pa(_start));

(if flowing across multiple lines is needed / wanted)

> --- a/xen/arch/ppc/opal.c
> +++ b/xen/arch/ppc/opal.c
> @@ -1,8 +1,10 @@
>  /* SPDX-License-Identifier: GPL-2.0-or-later */
> +#include "xen/compiler.h"

Surely <xen/compiler.h>, but then neither this nor ...

>  #include <asm/boot.h>
>  #include <asm/early_printk.h>
>  #include <asm/opal-api.h>
>  #include <asm/processor.h>
> +#include <asm/mm.h>

... this addition look motivated when nothing else changes in this file.

Also (I overlooked this in the earlier patch) all xen/ headers generally
want to come ahead of all asm/ ones.

> --- a/xen/arch/ppc/setup.c
> +++ b/xen/arch/ppc/setup.c
> @@ -7,6 +7,8 @@
>  /* Xen stack for bringing up the first CPU. */
>  unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE);
>  
> +void __init setup_initial_pagetables(void);

This needs to live in a header which is included both here and in the
file defining the function.

> +static void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
> +                                    unsigned int pid, unsigned int ric,
> +                                    unsigned int prs)
> +{
> +    unsigned long rb;
> +    unsigned long rs;
> +
> +    rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
> +    rs = ((unsigned long) pid << PPC_BITLSHIFT(31));
> +
> +    asm volatile ( "tlbiel %0, %1, %2, %3, 1" :
> +                  : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory" );

Nit: One too few indenting blanks (besides others that are missing).
Perhaps also better to pull the standalone : onto this second line.
Overall perhaps (also for line length)

    asm volatile ( "tlbiel %0, %1, %2, %3, 1"
                   :: "r" (rb), "r" (rs), "i" (ric), "i" (prs)
                   : "memory" );

> +void radix__tlbiel_all(unsigned int action)

Is the double underscore in here intentional?

Jan
Shawn Anastasio Aug. 9, 2023, 8:54 p.m. UTC | #2
On 8/1/23 8:18 AM, Jan Beulich wrote:
> On 29.07.2023 00:21, Shawn Anastasio wrote:
>> --- /dev/null
>> +++ b/xen/arch/ppc/include/asm/bitops.h
>> @@ -0,0 +1,11 @@
>> +#ifndef _ASM_PPC_BITOPS_H
>> +#define _ASM_PPC_BITOPS_H
>> +
>> +#include <xen/compiler.h>
> 
> Not a big deal, but ...
> 
>> +/* PPC bit number conversion */
>> +#define PPC_BITLSHIFT(be)	(BITS_PER_LONG - 1 - (be))
>> +#define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
>> +#define PPC_BITMASK(bs, be)	((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
>> +
>> +#endif /* _ASM_PPC_BITOPS_H */
> 
> ... nothing here looks to require that #include.
>

Good point, I'll drop it.

>> --- /dev/null
>> +++ b/xen/arch/ppc/include/asm/mm.h
>> @@ -0,0 +1,19 @@
>> +#ifndef _ASM_PPC_MM_H
>> +#define _ASM_PPC_MM_H
>> +
>> +#include <asm/config.h>
> 
> This is included by xen/config.h, which in turn is included from the
> compiler command line.
> 

Thanks for letting me know. Will drop.

>> --- /dev/null
>> +++ b/xen/arch/ppc/include/asm/page.h
>> @@ -0,0 +1,178 @@
>> +#ifndef _ASM_PPC_PAGE_H
>> +#define _ASM_PPC_PAGE_H
>> +
>> +#include <xen/types.h>
>> +
>> +#include <asm/bitops.h>
>> +#include <asm/byteorder.h>
>> +
>> +#define PDE_VALID     PPC_BIT(0)
>> +#define PDE_NLB_MASK  0xfffffffffffffUL
>> +#define PDE_NLB_SHIFT 5UL
>> +#define PDE_NLS_MASK  0x1f
>> +
>> +#define PTE_VALID     PPC_BIT(0)
>> +#define PTE_LEAF      PPC_BIT(1)
>> +#define PTE_REFERENCE PPC_BIT(55)
>> +#define PTE_CHANGE    PPC_BIT(56)
>> +
>> +/* PTE Attributes */
>> +#define PTE_ATT_SAO            PPC_BIT(59) /* Strong Access Ordering */
>> +#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58)
>> +#define PTE_ATT_TOLERANT       (PPC_BIT(58) | PPC_BIT(59))
>> +
>> +/* PTE Encoded Access Authority */
>> +#define PTE_EAA_PRIVILEGED PPC_BIT(60)
>> +#define PTE_EAA_READ       PPC_BIT(61)
>> +#define PTE_EAA_WRITE      PPC_BIT(62)
>> +#define PTE_EAA_EXECUTE    PPC_BIT(63)
>> +
>> +/* Field shifts/masks */
>> +#define PTE_RPN_MASK  0x1fffffffffffUL
>> +#define PTE_RPN_SHIFT 12UL
>> +#define PTE_ATT_MASK  0x3UL
>> +#define PTE_ATT_SHIFT 4UL
>> +#define PTE_EAA_MASK  0xfUL
> 
> Did you consider introducing just *_MASK values, utilizing MASK_INSR()
> and MASK_EXTR() instead of open-coded shifting/masking?
>

I actually wasn't aware of MASK_INSR/MASK_EXTR when writing this. I've
just looked into it though, and I don't think using them makes the code
much cleaner. Specifically I'm looking at the implementations of
`pte_to_paddr` and `pde_to_paddr` which both need to ensure that the
returned value retains its original shift.

For example, with pte_to_paddr, this change would be:
-     return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT);
+     return MASK_EXTR(be64_to_cpu(pte.pte), PTE_RPN_MASK) << PTE_RPN_SHIFT;

In addition to updating the definitions of the *_MASK macros to include
the right-most padding zeros.

- #define PTE_RPN_MASK  0x1fffffffffffUL
+ #define PTE_RPN_MASK  0x1fffffffffff000UL

I don't really think this is an improvement, so I'm tempted to keep the
code as-is. If you feel strongly about the usage of MASK_{INSR,EXTR},
though, I could make the change.

>> +#define PTE_XEN_BASE (PTE_VALID | PTE_EAA_PRIVILEGED | PTE_REFERENCE)
>> +#define PTE_XEN_RW   (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_WRITE | PTE_CHANGE)
>> +#define PTE_XEN_RO   (PTE_XEN_BASE | PTE_EAA_READ)
>> +#define PTE_XEN_RX   (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_EXECUTE)
>> +
>> +/*
>> + * Radix Tree layout for 64KB pages:
>> + *
>> + * [ L1 (ROOT) PAGE DIRECTORY (8192 * sizeof(pde_t)) ]
>> + *                     |
>> + *                     |
>> + *                     v
>> + *    [ L2 PAGE DIRECTORY (512 * sizeof(pde_t)) ]
>> + *                     |
>> + *                     |
>> + *                     v
>> + *    [ L3 PAGE DIRECTORY (512 * sizeof(pde_t)) ]
>> + *                     |
>> + *                     |
>> + *                     v
>> + *      [ L4 PAGE TABLE (32 * sizeof(pte_t)) ]
>> + *                     |
>> + *                     |
>> + *                     v
>> + *            [ PAGE TABLE ENTRY ]
>> + */
>> +
>> +#define XEN_PT_ENTRIES_LOG2_LVL_1 13 /* 2**13 entries, maps 2**13 * 512GB = 4PB */
>> +#define XEN_PT_ENTRIES_LOG2_LVL_2 9  /* 2**9  entries, maps 2**9  * 1GB = 512GB */
>> +#define XEN_PT_ENTRIES_LOG2_LVL_3 9  /* 2**9  entries, maps 2**9  * 2MB = 1GB */
>> +#define XEN_PT_ENTRIES_LOG2_LVL_4 5  /* 2**5  entries, maps 2**5  * 64K = 2MB */
>> +
>> +#define XEN_PT_SHIFT_LVL_1    (XEN_PT_SHIFT_LVL_2 + XEN_PT_ENTRIES_LOG2_LVL_2)
>> +#define XEN_PT_SHIFT_LVL_2    (XEN_PT_SHIFT_LVL_3 + XEN_PT_ENTRIES_LOG2_LVL_3)
>> +#define XEN_PT_SHIFT_LVL_3    (XEN_PT_SHIFT_LVL_4 + XEN_PT_ENTRIES_LOG2_LVL_4)
>> +#define XEN_PT_SHIFT_LVL_4    PAGE_SHIFT
>> +
>> +#define XEN_PT_ENTRIES_LOG2_LVL(lvl) (XEN_PT_ENTRIES_LOG2_LVL_##lvl)
>> +#define XEN_PT_SHIFT_LVL(lvl)        (XEN_PT_SHIFT_LVL_##lvl)
>> +#define XEN_PT_ENTRIES_LVL(lvl)      (1UL << XEN_PT_ENTRIES_LOG2_LVL(lvl))
>> +#define XEN_PT_MASK_LVL(lvl)         (XEN_PT_ENTRIES_LVL(lvl) - 1)
>> +#define XEN_PT_INDEX_LVL(lvl, va)    ((va >> XEN_PT_SHIFT_LVL(lvl)) & XEN_PT_MASK_LVL(lvl))
>> +
>> +/*
>> + * Calculate the index of the provided virtual address in the provided
>> + * page table struct
>> + */
>> +#define pt_index(pt, va) _Generic((pt), \
> 
> Earlier on I did ask about the minimum compiler version you require for
> building the PPC hypervisor. Iirc you said about any, but here you're
> using another feature where I'm not sure how far back it is available.
> As indicated back then, it would be nice if ./README could gain
> respective information.
> 

I'm locally building with gcc 10 (shipped with Debian 11), and the Xen
CI image for ppc64le builds uses gcc 11 I believe. _Generic and the
other features I'm using are certainly available further back, but I
haven't personally tested anything earlier. If there's no objections,
I'm tempted to just codify gcc 10 as the minimum supported compiler
version and leave it up to users if they want to try testing on earlier
versions.

>> +    struct lvl1_pd * : XEN_PT_INDEX_LVL(1, (va)), \
>> +    struct lvl2_pd * : XEN_PT_INDEX_LVL(2, (va)), \
>> +    struct lvl3_pd * : XEN_PT_INDEX_LVL(3, (va)), \
>> +    struct lvl4_pt * : XEN_PT_INDEX_LVL(4, (va)))
>> +
>> +#define pt_entry(pt, va) (&((pt)->entries[pt_index((pt), (va))]))
> 
> As to style: Here (and elsewhere) you put unnecessary parentheses
> around two of the three uses of the macro arguments. If you think you
> want to keep it like that, feel free; in general we're recommending to
> omit ones not really needed, to help readability. Otoh a little higher
> up in XEN_PT_INDEX_LVL() parentheses are missing around va.
>

If possible I'd prefer to just leave it as-is. Unconditionally
parenthesizing macro arguments seems less error-prone as a general
policy, though I agree the latter two here are unnecessary.

That said, the lack of parentheses around va in XEN_PT_INDEX_LVL was
definitely an oversight and I'll fix that.

>> +typedef struct
>> +{
>> +    __be64 pde;
>> +} pde_t;
>> +
>> +typedef struct
>> +{
>> +    __be64 pte;
>> +} pte_t;
>> +
>> +struct lvl1_pd
>> +{
>> +    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1];
>> +};
>> +
>> +struct lvl2_pd
>> +{
>> +    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_2];
>> +};
>> +
>> +struct lvl3_pd
>> +{
>> +    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_3];
>> +};
>> +
>> +struct lvl4_pt
>> +{
>> +    pte_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_4];
>> +};
>> +
>> +static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags)
>> +{
>> +    return (pte_t){ .pte = cpu_to_be64(paddr | flags | PTE_LEAF) };
>> +}
>> +
>> +static inline pde_t paddr_to_pde(paddr_t paddr, unsigned long flags, unsigned long nls)
>> +{
>> +    return (pde_t){ .pde = cpu_to_be64(paddr | flags | nls) };
>> +}
> 
> In these two, are you sure you want to demand all callers to only ever
> pass in page-aligned addresses?
>

Fair point. Perhaps it would be best to mask the caller-provided
address.
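
Something like this, say (a sketch reusing the existing PTE_RPN_*
definitions; paddr_to_pde() would mask with the PDE_NLB_* values in
the same way):

    static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags)
    {
        /* Clip the address to the RPN field so an unaligned caller
         * can't clobber the flag bits. */
        paddr_t rpn = paddr & (PTE_RPN_MASK << PTE_RPN_SHIFT);

        return (pte_t){ .pte = cpu_to_be64(rpn | flags | PTE_LEAF) };
    }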

>> +static inline paddr_t pte_to_paddr(pte_t pte)
>> +{
>> +    return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT);
>> +}
>> +
>> +static inline paddr_t pde_to_paddr(pde_t pde)
>> +{
>> +    return be64_to_cpu(pde.pde) & (PDE_NLB_MASK << PDE_NLB_SHIFT);
>> +}
>> +
>> +static inline bool pte_is_valid(pte_t pte)
>> +{
>> +    return pte.pte & be64_to_cpu(PTE_VALID);
>> +}
>> +
>> +static inline bool pde_is_valid(pde_t pde)
>> +{
>> +    return pde.pde & be64_to_cpu(PDE_VALID);
>> +}
>> +
>> +/*
>> + * ISA 3.0 partition and process table entry format
>> + */
>> +struct patb_entry {
>> +	__be64 patb0;
>> +	__be64 patb1;
>> +};
>> +#define PATB0_HR PPC_BIT(0) /* host uses radix */
>> +#define PATB1_GR PPC_BIT(0) /* guest uses radix; must match HR */
>> +
>> +struct prtb_entry {
>> +	__be64 prtb0;
>> +	__be64 reserved;
>> +};
>> +
>> +/*
>> + * We support 52 bits, hence:
>> + * bits 52 - 31 = 21, 0b10101
>> + * RTS encoding details
>> + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long
>> + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
>> + */
>> +#define RTS_FIELD ((0x2UL << 61) | (0x5UL << 5))
> 
> I'm afraid I can't bring comment and #define together. Bits 0-3 are
> 4 bits; bits 6-8 are only three. And I'd expect the lower ends of
> the ranges to appear as shift counts in the expression.
>

Yes, I agree that this is confusing. For reference, this comment was
verbatim copied from a comment in Linux[*] and I admittedly didn't pay
as much attention to it as I did to the expression itself.

As far as I can tell, the original author's intent was to use 1-indexed
non-PPC-style bit numbering and wrote 0-3 instead of 1-3 by mistake.
Under this interpretation, the comment makes more sense, as do the
shift counts in the expression being off by one.

I'll rewrite it to correct the mistake and use standard 0-indexing for
bits. Something like:

/*
 * We support 52 bits, hence:
 * bits 52 - 31 = 21, 0b10101
 * RTS encoding details
 * bits 0 - 2 of rts -> bits 5 - 7 of unsigned long
 * bits 3 - 4 of rts -> bits 61 - 62 of unsigned long
 */

[*] arch/powerpc/include/asm/book3s/64/radix.h:radix__get_tree_size
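
Spelling that encoding out numerically (a sketch, matching the
corrected comment):

    /*
     * 52-bit address space: RTS = 52 - 31 = 21 = 0b10101 (5 bits).
     * Low 3 bits  (0b101 = 0x5) -> bits 5-7  of the doubleword,
     * high 2 bits (0b10  = 0x2) -> bits 61-62.
     */
    #define RTS_FIELD ((0x2UL << 61) | (0x5UL << 5))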

>> --- a/xen/arch/ppc/include/asm/processor.h
>> +++ b/xen/arch/ppc/include/asm/processor.h
>> @@ -133,6 +133,37 @@ struct cpu_user_regs
>>      uint32_t entry_vector;
>>  };
>>  
>> +static __inline__ void sync(void)
>> +{
>> +    asm volatile ( "sync" );
>> +}
>> +
>> +static __inline__ void isync(void)
>> +{
>> +    asm volatile ( "isync" );
>> +}
>> +
>> +static inline unsigned long mfmsr(void)
>> +{
>> +    unsigned long msr;
>> +    asm volatile ( "mfmsr %0" : "=&r"(msr) );
> 
> Why the &?

This was in the original Xen 3.2 header that this function was taken
from, but I agree that it seems strange. I'll drop it in v2.

> Also nit: Missing blank between closing double quote and opening paren.
> More instances of this below.
>

Will fix.

>> +    return msr;
>> +}
>> +
>> +static inline void mtmsrd(unsigned long msr)
>> +{
>> +    asm volatile ( "mtmsrd %0" : : "r"(msr) );
>> +}
>> +
>> +#define mtspr(reg, val) asm volatile ( "mtspr %0,%1" : : "K"(reg), "r"(val) )
>> +
>> +#define mfspr(reg)                                                             \
>> +    ({                                                                         \
>> +        unsigned long val;                                                     \
>> +        asm volatile ( "mfspr %0,%1" : "=r"(val) : "K"(reg) );                 \
>> +        val;                                                                   \
>> +    })
> 
> Why #define-s here when higher up you successfully use inline functions?
> 

This was a temporary thing that I meant to convert into a full function
before committing but seem to have overlooked. Thanks for pointing it
out.

> Also (nit) blanks would again be nice ahead of the commas.
> 

Will do.

>> --- /dev/null
>> +++ b/xen/arch/ppc/include/asm/regs.h
>> @@ -0,0 +1,138 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +/*
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> 
> You have an SPDX line first thing - why the spelled out license?
>

Leftover from importing the header from Xen 3.2. Will drop.

>> --- /dev/null
>> +++ b/xen/arch/ppc/mm-radix.c
>> @@ -0,0 +1,268 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#include <xen/init.h>
>> +#include <xen/kernel.h>
>> +#include <xen/types.h>
>> +#include <xen/lib.h>
> 
> Nit: Please sort headers whenever possible; also ...
> 
>> +#include <asm/bitops.h>
>> +#include <asm/byteorder.h>
>> +#include <asm/early_printk.h>
>> +#include <asm/mm.h>
>> +#include <asm/page.h>
>> +#include <asm/processor.h>
>> +#include <asm/regs.h>
>> +#include <asm/msr.h>
> 
> ... this group then.
>

I'll alphabetically sort both groups.

>> +void enable_mmu(void);
>> +
>> +#define INITIAL_LVL1_PD_COUNT      1
>> +#define INITIAL_LVL2_LVL3_PD_COUNT 2
>> +#define INITIAL_LVL4_PT_COUNT      256
>> +
>> +static size_t initial_lvl1_pd_pool_used;
>> +static struct lvl1_pd __aligned(sizeof(struct lvl1_pd))
>> +initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT];
>> +
>> +static size_t initial_lvl2_lvl3_pd_pool_used;
>> +static struct lvl2_pd __aligned(sizeof(struct lvl2_pd))
>> +initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT];
>> +
>> +static size_t initial_lvl4_pt_pool_used;
>> +static struct lvl4_pt __aligned(sizeof(struct lvl4_pt))
>> +initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT];
> 
> Can instances of these structures usefully exist and be not suitably
> aligned? If not, having the types rather than the instances already
> declare the necessary alignment would seem better to me.
>

I can't think of a scenario in which we'd want instances of these
structures without alignment. I'll go ahead and apply the alignment to
the struct definitions instead.
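
I.e. roughly (sketch; the alignment is computed from the entry count
and entry size rather than sizeof(struct lvl1_pd), so the attribute
doesn't reference the type it is attached to):

    struct lvl1_pd
    {
        pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1];
    } __aligned((1U << XEN_PT_ENTRIES_LOG2_LVL_1) * sizeof(pde_t));

The pool declarations then become plain arrays of the already-aligned
types.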

>> +/* Only reserve minimum Partition and Process tables  */
>> +#define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */
>> +#define PATB_SIZE      (1UL << PATB_SIZE_LOG2)
>> +#define PRTB_SIZE_LOG2 12
>> +#define PRTB_SIZE      (1UL << PRTB_SIZE_LOG2)
>> +
>> +static struct patb_entry
>> +    __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)];
>> +
>> +static struct prtb_entry
>> +    __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)];
> 
> The situation is different here, as the arrays here aren't embedded into
> the type (their elements being just single entries).
>

Right.

>> +_Static_assert(sizeof(initial_patb) == PATB_SIZE);
> 
> Is this really useful? And if so, why no similar check for prtb?
> Furthermore BUILD_BUG_ON() would be better; this again plays into the
> question of what range of compilers is suitable for building this.
>

This was a debug statement that I had inadvertently left in. I'll remove
it.

>> +static void __init setup_initial_mapping(struct lvl1_pd *lvl1,
>> +                                         vaddr_t map_start,
>> +                                         vaddr_t map_end,
>> +                                         paddr_t phys_base)
>> +{
>> +    uint64_t page_addr;
>> +
>> +    if ( map_start & ~PAGE_MASK )
>> +    {
>> +        early_printk("Xen _start be aligned to 64k (PAGE_SIZE) boundary\n");
>> +        die();
>> +    }
>> +
>> +    if ( (uint64_t) phys_base & ~PAGE_MASK )
> 
> Why the cast?
>

I believe in an earlier revision of this code I had phys_base's type as
void* or similar, so this is just a leftover that I forgot to clean up.
I'll remove the cast.

>> +    {
>> +        early_printk("Xen should be loaded at 64k (PAGE_SIZE) boundary\n");
>> +        die();
>> +    }
>> +
>> +    for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE )
>> +    {
>> +        struct lvl2_pd *lvl2;
>> +        struct lvl3_pd *lvl3;
>> +        struct lvl4_pt *lvl4;
>> +        pde_t *pde;
>> +        pte_t *pte;
>> +
>> +        /* Allocate LVL 2 PD if necessary */
>> +        pde = pt_entry(lvl1, page_addr);
>> +        if ( !pde_is_valid(*pde) )
>> +        {
>> +            lvl2 = lvl2_pd_pool_alloc();
>> +            *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2);
> 
> paddr_to_pde() doesn't mask off the top bits. Are you relying on
> lvl2_pd_pool_alloc() using PIC addressing to get at the value it
> then returns?
>

I'm not sure I understand the question here. The pointer returned by
lvl2_pd_pool_alloc() will indeed have the top address bits set in
accordance with the link address, even before paging is enabled because
of the relocation code and jump to XEN_VIRT_START in patch 2. This is
fine though, since I call __pa() to strip off these bits before passing
it to paddr_to_pde.

> Also this and the similar lines below look to be a little too long.
>

Yeah, I noticed this as well, but my understanding was that lines longer
than 80 columns were permitted in cases where they don't hurt
readability. In any case, I can fix this.

>> +        }
>> +        else
>> +            lvl2 = (struct lvl2_pd *) pde_to_paddr(*pde);
> 
> This casts a physical address to a pointer; normally only virtual
> addresses can be used this way. Could there be a helper added to
> have such a dangerous-looking construct in (ideally) just a single
> place?
>

Fair point. We already have __va() which will work due to the previously
mentioned relocation code in patch 2 even before the MMU is enabled. I
think it might make sense to use that.
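
E.g. via a small helper (sketch; pde_to_virt is just a name I'm making
up here):

    /* Translate a PDE's next-level base back into a usable pointer. */
    static inline void *pde_to_virt(pde_t pde)
    {
        return __va(pde_to_paddr(pde));
    }

so that the fallback branches become e.g. lvl2 = pde_to_virt(*pde);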

>> +        /* Allocate LVL 3 PD if necessary */
>> +        pde = pt_entry(lvl2, page_addr);
>> +        if ( !pde_is_valid(*pde) )
>> +        {
>> +            lvl3 = lvl3_pd_pool_alloc();
>> +            *pde = paddr_to_pde(__pa(lvl3), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_3);
>> +        }
>> +        else
>> +            lvl3 = (struct lvl3_pd *) pde_to_paddr(*pde);
>> +
>> +        /* Allocate LVL 4 PT if necessary */
>> +        pde = pt_entry(lvl3, page_addr);
>> +        if ( !pde_is_valid(*pde) )
>> +        {
>> +            lvl4 = lvl4_pt_pool_alloc();
>> +            *pde = paddr_to_pde(__pa(lvl4), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_4);
>> +        }
>> +        else
>> +            lvl4 = (struct lvl4_pt *) pde_to_paddr(*pde);
>> +
>> +        /* Finally, create PTE in LVL 4 PT */
>> +        pte = pt_entry(lvl4, page_addr);
>> +        if ( !pte_is_valid(*pte) )
>> +        {
>> +            unsigned long paddr = (page_addr - map_start) + phys_base;
>> +            unsigned long flags;
>> +            radix_dprint(paddr, "being mapped to ");
> 
> Blank line again please between declarations and statements.
>

Will fix.

>> +            radix_dprint(page_addr, "!\n");
>> +            if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
>> +            {
>> +                radix_dprint(page_addr, "being marked as TEXT (RX)\n");
>> +                flags = PTE_XEN_RX;
>> +            }
>> +            else if ( is_kernel_rodata(page_addr) )
>> +            {
>> +                radix_dprint(page_addr, "being marked as RODATA (RO)\n");
>> +                flags = PTE_XEN_RO;
>> +            }
>> +            else
>> +            {
>> +                radix_dprint(page_addr, "being marked as DEFAULT (RW)\n");
>> +                flags = PTE_XEN_RW;
>> +            }
>> +
>> +            *pte = paddr_to_pte(paddr, flags);
>> +            radix_dprint(paddr_to_pte(paddr, flags).pte,
>> +                             "is result of PTE map!\n");
>> +        }
>> +        else
>> +        {
>> +            early_printk("BUG: Tried to create PTE for already-mapped page!");
>> +            die();
>> +        }
>> +    }
>> +}
>> +
>> +static void setup_partition_table(struct patb_entry *patb, struct lvl1_pd *root)
> 
> __init?
>

Yes, good call. Will add.

>> +{
>> +    unsigned long ptcr;
>> +
>> +    /* Configure entry for LPID 0 to enable Radix and point to root PD */
>> +    uint64_t patb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1 | PATB0_HR;
>> +    uint64_t patb1 = __pa(initial_prtb) | (PRTB_SIZE_LOG2 - 12) | PATB1_GR;
>> +    patb[0].patb0 = cpu_to_be64(patb0);
>> +    patb[0].patb1 = cpu_to_be64(patb1);
>> +
>> +    ptcr = __pa(initial_patb) | (PATB_SIZE_LOG2 - 12);
>> +    mtspr(SPRN_PTCR, ptcr);
>> +}
>> +
>> +static void setup_process_table(struct prtb_entry *prtb, struct lvl1_pd *root)
> 
> __init?
>

Ditto.

>> +{
>> +    /* Configure entry for PID 0 to point to root PD */
>> +    uint64_t prtb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1;
>> +    initial_prtb[0].prtb0 = cpu_to_be64(prtb0);
>> +}
>> +
>> +void __init setup_initial_pagetables(void)
>> +{
>> +    struct lvl1_pd *root = lvl1_pd_pool_alloc();
>> +    unsigned long lpcr;
>> +
>> +    setup_initial_mapping(root,
>> +                         (vaddr_t) _start,
>> +                         (vaddr_t) _end,
>> +                          __pa(_start));
> 
>     setup_initial_mapping(root,
>                           (vaddr_t)_start,
>                           (vaddr_t)_end,
>                           __pa(_start));
> 
> (if flowing across multiple lines is needed / wanted)
>

I just checked and the line without flowing fits within exactly 80
columns, so I'll make that change.

>> --- a/xen/arch/ppc/opal.c
>> +++ b/xen/arch/ppc/opal.c
>> @@ -1,8 +1,10 @@
>>  /* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#include "xen/compiler.h"
> 
> Surely <xen/compiler.h>, but then neither this nor ...
>

This was accidentally added by my editor/clangd and was not my
intention. I'll remove all changes to this file in this patch.

>>  #include <asm/boot.h>
>>  #include <asm/early_printk.h>
>>  #include <asm/opal-api.h>
>>  #include <asm/processor.h>
>> +#include <asm/mm.h>
> 
> ... this addition look motivated when nothing else changes in this file.
>

See above.

> Also (I overlooked this in the earlier patch) all xen/ headers generally
> want to come ahead of all asm/ ones.
>

I'll keep that in mind going forward and for the next time I have a
valid reason to touch this file.

>> --- a/xen/arch/ppc/setup.c
>> +++ b/xen/arch/ppc/setup.c
>> @@ -7,6 +7,8 @@
>>  /* Xen stack for bringing up the first CPU. */
>>  unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE);
>>  
>> +void __init setup_initial_pagetables(void);
> 
> This needs to live in a header which is included both here and in the
> file defining the function.
>

I see I already declared the function's prototype in mm.h. This was
probably an oversight from before I added that. I'll drop the prototype
from setup.c and include asm/mm.h instead.

>> +static void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
>> +                                    unsigned int pid, unsigned int ric,
>> +                                    unsigned int prs)
>> +{
>> +    unsigned long rb;
>> +    unsigned long rs;
>> +
>> +    rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
>> +    rs = ((unsigned long) pid << PPC_BITLSHIFT(31));
>> +
>> +    asm volatile ( "tlbiel %0, %1, %2, %3, 1" :
>> +                  : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory" );
> 
> Nit: One too few indenting blanks (besides others that are missing).
> Perhaps also better to pull the standalone : onto this second line.
> Overall perhaps (also for line length)
> 
>     asm volatile ( "tlbiel %0, %1, %2, %3, 1"
>                    :: "r" (rb), "r" (rs), "i" (ric), "i" (prs)
>                    : "memory" );
>

Will do.

>> +void radix__tlbiel_all(unsigned int action)
> 
> Is the double underscore in here intentional?
>

It matches the original Linux file from which this routine was imported
(referenced in the file's top-level comment).

As I understand it, this is meant to indicate a private function that
shouldn't be called outside of radix-specific MMU or TLB code.  As
Linux's radix support code spans multiple files, a static function
wouldn't work for that.

> Jan

Thanks,
Shawn
Jan Beulich Aug. 10, 2023, 8:49 a.m. UTC | #3
On 09.08.2023 22:54, Shawn Anastasio wrote:
> On 8/1/23 8:18 AM, Jan Beulich wrote:
>> On 29.07.2023 00:21, Shawn Anastasio wrote:
>>> --- /dev/null
>>> +++ b/xen/arch/ppc/include/asm/page.h
>>> @@ -0,0 +1,178 @@
>>> +#ifndef _ASM_PPC_PAGE_H
>>> +#define _ASM_PPC_PAGE_H
>>> +
>>> +#include <xen/types.h>
>>> +
>>> +#include <asm/bitops.h>
>>> +#include <asm/byteorder.h>
>>> +
>>> +#define PDE_VALID     PPC_BIT(0)
>>> +#define PDE_NLB_MASK  0xfffffffffffffUL
>>> +#define PDE_NLB_SHIFT 5UL
>>> +#define PDE_NLS_MASK  0x1f
>>> +
>>> +#define PTE_VALID     PPC_BIT(0)
>>> +#define PTE_LEAF      PPC_BIT(1)
>>> +#define PTE_REFERENCE PPC_BIT(55)
>>> +#define PTE_CHANGE    PPC_BIT(56)
>>> +
>>> +/* PTE Attributes */
>>> +#define PTE_ATT_SAO            PPC_BIT(59) /* Strong Access Ordering */
>>> +#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58)
>>> +#define PTE_ATT_TOLERANT       (PPC_BIT(58) | PPC_BIT(59))
>>> +
>>> +/* PTE Encoded Access Authority */
>>> +#define PTE_EAA_PRIVILEGED PPC_BIT(60)
>>> +#define PTE_EAA_READ       PPC_BIT(61)
>>> +#define PTE_EAA_WRITE      PPC_BIT(62)
>>> +#define PTE_EAA_EXECUTE    PPC_BIT(63)
>>> +
>>> +/* Field shifts/masks */
>>> +#define PTE_RPN_MASK  0x1fffffffffffUL
>>> +#define PTE_RPN_SHIFT 12UL
>>> +#define PTE_ATT_MASK  0x3UL
>>> +#define PTE_ATT_SHIFT 4UL
>>> +#define PTE_EAA_MASK  0xfUL
>>
>> Did you consider introducing just *_MASK values, utilizing MASK_INSR()
>> and MASK_EXTR() instead of open-coded shifting/masking?
>>
> 
> I actually wasn't aware of MASK_INSR/MASK_EXTR when writing this. I've
> just looked into it though, and I don't think using them makes the code
> much cleaner. Specifically I'm looking at the implementations of
> `pte_to_paddr` and `pde_to_paddr` which both need to ensure that the
> returned value retains its original shift.
> 
> For example, with pte_to_paddr, this change would be:
> -     return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT);
> +     return MASK_EXTR(be64_to_cpu(pte.pte), PTE_RPN_MASK) << PTE_RPN_SHIFT;
> 
> In addition to updating the definitions of the *_MASK macros to include
> the right-most padding zeros.
> 
> - #define PTE_RPN_MASK  0x1fffffffffffUL
> + #define PTE_RPN_MASK  0x1fffffffffff000UL

This is the important change. With that the above will be

     return be64_to_cpu(pte.pte) & PTE_RPN_MASK;

and in cases where you want the un-shifted value you'd use MASK_EXTR().
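
I.e. overall (sketch):

    #define PTE_RPN_MASK  0x1fffffffffff000UL /* bits 12-56, pre-shifted */

    static inline paddr_t pte_to_paddr(pte_t pte)
    {
        return be64_to_cpu(pte.pte) & PTE_RPN_MASK;
    }

    /* And where the raw frame number is wanted: */
    unsigned long rpn = MASK_EXTR(be64_to_cpu(pte.pte), PTE_RPN_MASK);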

>>> +/*
>>> + * Calculate the index of the provided virtual address in the provided
>>> + * page table struct
>>> + */
>>> +#define pt_index(pt, va) _Generic((pt), \
>>
>> Earlier on I did ask about the minimum compiler version you require for
>> building the PPC hypervisor. Iirc you said about any, but here you're
>> using another feature where I'm not sure how far back it is available.
>> As indicated back then, it would be nice if ./README could gain
>> respective information.
>>
> 
> I'm locally building with gcc 10 (shipped with Debian 11), and the Xen
> CI image for ppc64le builds uses gcc 11 I believe. _Generic and the
> other features I'm using are certainly available further back, but I
> haven't personally tested anything earlier. If there's no objections,
> I'm tempted to just codify gcc 10 as the minimum supported compiler
> version and leave it up to users if they want to try testing on earlier
> versions.

Spelling out what you know works is good enough for starters. As you
say, people can propose updating it if it turns out to be too high.

>>> +    for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE )
>>> +    {
>>> +        struct lvl2_pd *lvl2;
>>> +        struct lvl3_pd *lvl3;
>>> +        struct lvl4_pt *lvl4;
>>> +        pde_t *pde;
>>> +        pte_t *pte;
>>> +
>>> +        /* Allocate LVL 2 PD if necessary */
>>> +        pde = pt_entry(lvl1, page_addr);
>>> +        if ( !pde_is_valid(*pde) )
>>> +        {
>>> +            lvl2 = lvl2_pd_pool_alloc();
>>> +            *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2);
>>
>> paddr_to_pde() doesn't mask off the top bits. Are you relying on
>> lvl2_pd_pool_alloc() using PIC addressing to get at the value it
>> then returns?
>>
> 
> I'm not sure I understand the question here. The pointer returned by
> lvl2_pd_pool_alloc() will indeed have the top address bits set in
> accordance with the link address, even before paging is enabled because
> of the relocation code and jump to XEN_VIRT_START in patch 2. This is
> fine though, since I call __pa() to strip off these bits before passing
> it to paddr_to_pde.

I'm sorry, I managed to overlook the __pa().

>> Also this and the similar lines below look to be a little too long.
> 
> Yeah, I noticed this as well, but my understanding was that lines longer
> than 80 columns were permitted in cases where they don't hurt
> readability. In any case, I can fix this.

There's only one general exception to the line limit: printk() (and the
like) format strings want to live all on one line (unless there are
embedded \n), to allow reasonably grep-ing for them.

>>> +void radix__tlbiel_all(unsigned int action)
>>
>> Is the double underscore in here intentional?
>>
> 
> It matches the original Linux file from which this routine was imported
> (referenced in the file's top-level comment).
> 
> As I understand it, this is meant to indicate a private function that
> shouldn't be called outside of radix-specific MMU or TLB code.  As
> Linux's radix support code spans multiple files, a static function
> wouldn't work for that.

We haven't adopted that style yet(?), but I'm also not opposed to you
doing so locally.

Jan

Patch

diff --git a/xen/arch/ppc/Makefile b/xen/arch/ppc/Makefile
index 0c0a7884a1..a059ac4c0a 100644
--- a/xen/arch/ppc/Makefile
+++ b/xen/arch/ppc/Makefile
@@ -2,8 +2,10 @@  obj-$(CONFIG_PPC64) += ppc64/
 
 obj-y += boot-of.init.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.init.o
+obj-y += mm-radix.o
 obj-y += opal.o
 obj-y += setup.o
+obj-y += tlb-radix.o
 
 $(TARGET): $(TARGET)-syms
 	cp -f $< $@
diff --git a/xen/arch/ppc/include/asm/bitops.h b/xen/arch/ppc/include/asm/bitops.h
new file mode 100644
index 0000000000..a7cd8ec7c5
--- /dev/null
+++ b/xen/arch/ppc/include/asm/bitops.h
@@ -0,0 +1,11 @@ 
+#ifndef _ASM_PPC_BITOPS_H
+#define _ASM_PPC_BITOPS_H
+
+#include <xen/compiler.h>
+
+/* PPC bit number conversion */
+#define PPC_BITLSHIFT(be)	(BITS_PER_LONG - 1 - (be))
+#define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
+#define PPC_BITMASK(bs, be)	((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
+
+#endif /* _ASM_PPC_BITOPS_H */
diff --git a/xen/arch/ppc/include/asm/mm.h b/xen/arch/ppc/include/asm/mm.h
new file mode 100644
index 0000000000..36e44a4356
--- /dev/null
+++ b/xen/arch/ppc/include/asm/mm.h
@@ -0,0 +1,19 @@ 
+#ifndef _ASM_PPC_MM_H
+#define _ASM_PPC_MM_H
+
+#include <asm/config.h>
+#include <asm/page-bits.h>
+
+#define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
+#define paddr_to_pfn(pa)  ((unsigned long)((pa) >> PAGE_SHIFT))
+
+#define virt_to_maddr(va) ((paddr_t)((vaddr_t)(va) & PADDR_MASK))
+#define maddr_to_virt(pa) ((void *)((paddr_t)(pa) | XEN_VIRT_START))
+
+/* Convert between Xen-heap virtual addresses and machine addresses. */
+#define __pa(x)             (virt_to_maddr(x))
+#define __va(x)             (maddr_to_virt(x))
+
+void setup_initial_pagetables(void);
+
+#endif /* _ASM_PPC_MM_H */
diff --git a/xen/arch/ppc/include/asm/page-bits.h b/xen/arch/ppc/include/asm/page-bits.h
index 4c01bf9716..0286177520 100644
--- a/xen/arch/ppc/include/asm/page-bits.h
+++ b/xen/arch/ppc/include/asm/page-bits.h
@@ -2,6 +2,7 @@ 
 #define __PPC_PAGE_BITS_H__
 
 #define PAGE_SHIFT              16 /* 64 KiB Pages */
-#define PADDR_BITS              48
+#define PADDR_BITS              53
+#define VADDR_BITS              52
 
 #endif /* __PPC_PAGE_BITS_H__ */
diff --git a/xen/arch/ppc/include/asm/page.h b/xen/arch/ppc/include/asm/page.h
new file mode 100644
index 0000000000..e58b0a7354
--- /dev/null
+++ b/xen/arch/ppc/include/asm/page.h
@@ -0,0 +1,178 @@ 
+#ifndef _ASM_PPC_PAGE_H
+#define _ASM_PPC_PAGE_H
+
+#include <xen/types.h>
+
+#include <asm/bitops.h>
+#include <asm/byteorder.h>
+
+#define PDE_VALID     PPC_BIT(0)
+#define PDE_NLB_MASK  0xfffffffffffffUL
+#define PDE_NLB_SHIFT 5UL
+#define PDE_NLS_MASK  0x1f
+
+#define PTE_VALID     PPC_BIT(0)
+#define PTE_LEAF      PPC_BIT(1)
+#define PTE_REFERENCE PPC_BIT(55)
+#define PTE_CHANGE    PPC_BIT(56)
+
+/* PTE Attributes */
+#define PTE_ATT_SAO            PPC_BIT(59) /* Strong Access Ordering */
+#define PTE_ATT_NON_IDEMPOTENT PPC_BIT(58)
+#define PTE_ATT_TOLERANT       (PPC_BIT(58) | PPC_BIT(59))
+
+/* PTE Encoded Access Authority */
+#define PTE_EAA_PRIVILEGED PPC_BIT(60)
+#define PTE_EAA_READ       PPC_BIT(61)
+#define PTE_EAA_WRITE      PPC_BIT(62)
+#define PTE_EAA_EXECUTE    PPC_BIT(63)
+
+/* Field shifts/masks */
+#define PTE_RPN_MASK  0x1fffffffffffUL
+#define PTE_RPN_SHIFT 12UL
+#define PTE_ATT_MASK  0x3UL
+#define PTE_ATT_SHIFT 4UL
+#define PTE_EAA_MASK  0xfUL
+
+#define PTE_XEN_BASE (PTE_VALID | PTE_EAA_PRIVILEGED | PTE_REFERENCE)
+#define PTE_XEN_RW   (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_WRITE | PTE_CHANGE)
+#define PTE_XEN_RO   (PTE_XEN_BASE | PTE_EAA_READ)
+#define PTE_XEN_RX   (PTE_XEN_BASE | PTE_EAA_READ | PTE_EAA_EXECUTE)
+
+/*
+ * Radix Tree layout for 64KB pages:
+ *
+ * [ L1 (ROOT) PAGE DIRECTORY (8192 * sizeof(pde_t)) ]
+ *                     |
+ *                     |
+ *                     v
+ *    [ L2 PAGE DIRECTORY (512 * sizeof(pde_t)) ]
+ *                     |
+ *                     |
+ *                     v
+ *    [ L3 PAGE DIRECTORY (512 * sizeof(pde_t)) ]
+ *                     |
+ *                     |
+ *                     v
+ *      [ L4 PAGE TABLE (32 * sizeof(pte_t)) ]
+ *                     |
+ *                     |
+ *                     v
+ *            [ PAGE TABLE ENTRY ]
+ */
+
+#define XEN_PT_ENTRIES_LOG2_LVL_1 13 /* 2**13 entries, maps 2**13 * 512GB = 4PB */
+#define XEN_PT_ENTRIES_LOG2_LVL_2 9  /* 2**9  entries, maps 2**9  * 1GB = 512GB */
+#define XEN_PT_ENTRIES_LOG2_LVL_3 9  /* 2**9  entries, maps 2**9  * 2MB = 1GB */
+#define XEN_PT_ENTRIES_LOG2_LVL_4 5  /* 2**5  entries, maps 2**5  * 64K = 2MB */
+
+#define XEN_PT_SHIFT_LVL_1    (XEN_PT_SHIFT_LVL_2 + XEN_PT_ENTRIES_LOG2_LVL_2)
+#define XEN_PT_SHIFT_LVL_2    (XEN_PT_SHIFT_LVL_3 + XEN_PT_ENTRIES_LOG2_LVL_3)
+#define XEN_PT_SHIFT_LVL_3    (XEN_PT_SHIFT_LVL_4 + XEN_PT_ENTRIES_LOG2_LVL_4)
+#define XEN_PT_SHIFT_LVL_4    PAGE_SHIFT
+
+#define XEN_PT_ENTRIES_LOG2_LVL(lvl) (XEN_PT_ENTRIES_LOG2_LVL_##lvl)
+#define XEN_PT_SHIFT_LVL(lvl)        (XEN_PT_SHIFT_LVL_##lvl)
+#define XEN_PT_ENTRIES_LVL(lvl)      (1UL << XEN_PT_ENTRIES_LOG2_LVL(lvl))
+#define XEN_PT_MASK_LVL(lvl)         (XEN_PT_ENTRIES_LVL(lvl) - 1)
+#define XEN_PT_INDEX_LVL(lvl, va)    ((va >> XEN_PT_SHIFT_LVL(lvl)) & XEN_PT_MASK_LVL(lvl))
+
+/*
+ * Calculate the index of the provided virtual address in the provided
+ * page table struct
+ */
+#define pt_index(pt, va) _Generic((pt), \
+    struct lvl1_pd * : XEN_PT_INDEX_LVL(1, (va)), \
+    struct lvl2_pd * : XEN_PT_INDEX_LVL(2, (va)), \
+    struct lvl3_pd * : XEN_PT_INDEX_LVL(3, (va)), \
+    struct lvl4_pt * : XEN_PT_INDEX_LVL(4, (va)))
+
+#define pt_entry(pt, va) (&((pt)->entries[pt_index((pt), (va))]))
+
+typedef struct
+{
+    __be64 pde;
+} pde_t;
+
+typedef struct
+{
+    __be64 pte;
+} pte_t;
+
+struct lvl1_pd
+{
+    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_1];
+};
+
+struct lvl2_pd
+{
+    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_2];
+};
+
+struct lvl3_pd
+{
+    pde_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_3];
+};
+
+struct lvl4_pt
+{
+    pte_t entries[1U << XEN_PT_ENTRIES_LOG2_LVL_4];
+};
+
+static inline pte_t paddr_to_pte(paddr_t paddr, unsigned long flags)
+{
+    return (pte_t){ .pte = cpu_to_be64(paddr | flags | PTE_LEAF) };
+}
+
+static inline pde_t paddr_to_pde(paddr_t paddr, unsigned long flags,
+                                 unsigned long nls)
+{
+    return (pde_t){ .pde = cpu_to_be64(paddr | flags | nls) };
+}
+
+static inline paddr_t pte_to_paddr(pte_t pte)
+{
+    return be64_to_cpu(pte.pte) & (PTE_RPN_MASK << PTE_RPN_SHIFT);
+}
+
+static inline paddr_t pde_to_paddr(pde_t pde)
+{
+    return be64_to_cpu(pde.pde) & (PDE_NLB_MASK << PDE_NLB_SHIFT);
+}
+
+static inline bool pte_is_valid(pte_t pte)
+{
+    return pte.pte & cpu_to_be64(PTE_VALID);
+}
+
+static inline bool pde_is_valid(pde_t pde)
+{
+    return pde.pde & cpu_to_be64(PDE_VALID);
+}
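+
+/*
+ * Example (illustrative): a read-only Xen mapping of physical address
+ * 0x40000000 could be created with
+ *   pte_t pte = paddr_to_pte(0x40000000UL, PTE_XEN_RO);
+ * which sets PTE_VALID | PTE_LEAF | PTE_EAA_PRIVILEGED |
+ * PTE_REFERENCE | PTE_EAA_READ along with the RPN, stored big-endian.
+ */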
+
+/*
+ * ISA 3.0 partition and process table entry format
+ */
+struct patb_entry {
+    __be64 patb0;
+    __be64 patb1;
+};
+#define PATB0_HR PPC_BIT(0) /* host uses radix */
+#define PATB1_GR PPC_BIT(0) /* guest uses radix; must match HR */
+
+struct prtb_entry {
+    __be64 prtb0;
+    __be64 reserved;
+};
+
+
+/*
+ * The Radix Tree Size (RTS) field encodes a 52-bit effective address
+ * space: RTS = 52 - 31 = 21 = 0b10101. The field is split within the
+ * entry: the high two bits (0b10) are placed at bit shift 61 and the
+ * low three bits (0b101) at bit shift 5.
+ */
+#define RTS_FIELD ((0x2UL << 61) | (0x5UL << 5))
+
+void tlbie_all(void);
+
+#endif /* _ASM_PPC_PAGE_H */
diff --git a/xen/arch/ppc/include/asm/processor.h b/xen/arch/ppc/include/asm/processor.h
index 2300640787..417288738c 100644
--- a/xen/arch/ppc/include/asm/processor.h
+++ b/xen/arch/ppc/include/asm/processor.h
@@ -133,6 +133,37 @@  struct cpu_user_regs
     uint32_t entry_vector;
 };
 
+static inline void sync(void)
+{
+    asm volatile ( "sync" );
+}
+
+static inline void isync(void)
+{
+    asm volatile ( "isync" );
+}
+
+static inline unsigned long mfmsr(void)
+{
+    unsigned long msr;
+
+    asm volatile ( "mfmsr %0" : "=r"(msr) );
+    return msr;
+}
+
+static inline void mtmsrd(unsigned long msr)
+{
+    asm volatile ( "mtmsrd %0" : : "r"(msr) );
+}
+
+#define mtspr(reg, val) asm volatile ( "mtspr %0,%1" : : "K"(reg), "r"(val) )
+
+#define mfspr(reg)                                                             \
+    ({                                                                         \
+        unsigned long val;                                                     \
+        asm volatile ( "mfspr %0,%1" : "=r"(val) : "K"(reg) );                 \
+        val;                                                                   \
+    })
+
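+/*
+ * Example usage (illustrative): SPRs are typically updated with a
+ * read-modify-write, e.g.
+ *   mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_HR);
+ * with the SPR numbers and bit definitions from asm/regs.h.
+ */
+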
 /*
  * panic() isn't available at the moment so an infinite loop will be
  * used temporarily.
diff --git a/xen/arch/ppc/include/asm/regs.h b/xen/arch/ppc/include/asm/regs.h
new file mode 100644
index 0000000000..d5f47a6ff1
--- /dev/null
+++ b/xen/arch/ppc/include/asm/regs.h
@@ -0,0 +1,138 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2005, 2007
+ *
+ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#ifndef _ASM_REG_DEFS_H_
+#define _ASM_REG_DEFS_H_
+
+/* Special Purpose Registers */
+#define SPRN_VRSAVE 256
+#define SPRN_DSISR  18
+#define SPRN_DAR    19
+#define SPRN_DEC    22
+#define SPRN_SRR0   26
+#define SPRN_SRR1   27
+#define SPRN_TBRL   268
+#define SPRN_TBRU   269
+#define SPRN_SPRG0  272
+#define SPRN_SPRG1  273
+#define SPRN_SPRG2  274
+#define SPRN_SPRG3  275
+#define SPRN_TBWL   284
+#define SPRN_TBWU   285
+
+#define SPRN_HSPRG0 304
+#define SPRN_HSPRG1 305
+#define SPRN_HDEC   310
+#define SPRN_HIOR   311
+#define SPRN_RMOR   312
+#define SPRN_HRMOR  313
+#define SPRN_HSRR0  314
+#define SPRN_HSRR1  315
+#define SPRN_LPIDR  319
+
+#define SPRN_PTCR   0x1D0 /* Partition Table Control Register */
+#define SPRN_PID    0x030 /* Process ID */
+
+/* Performance monitor spr encodings */
+#define SPRN_MMCRA  786
+#define   MMCRA_SAMPHV    _AC(0x10000000, UL) /* state of MSR HV when SIAR set */
+#define   MMCRA_SAMPPR    _AC(0x08000000, UL) /* state of MSR PR when SIAR set */
+#define   MMCRA_SAMPLE_ENABLE _AC(0x00000001, UL) /* enable sampling */
+#define NUM_PMCS 8
+#define SPRN_PMC1   787
+#define SPRN_PMC2   788
+#define SPRN_PMC3   789
+#define SPRN_PMC4   790
+#define SPRN_PMC5   791
+#define SPRN_PMC6   792
+#define SPRN_PMC7   793
+#define SPRN_PMC8   794
+#define SPRN_MMCR0  795
+#define   MMCR0_FC      _AC(0x80000000, UL) /* freeze counters */
+#define   MMCR0_FCS     _AC(0x40000000, UL) /* freeze in supervisor state */
+#define   MMCR0_FCP     _AC(0x20000000, UL) /* freeze in problem state */
+#define   MMCR0_FCM1    _AC(0x10000000, UL) /* freeze counters while MSR mark = 1 */
+#define   MMCR0_FCM0    _AC(0x08000000, UL) /* freeze counters while MSR mark = 0 */
+#define   MMCR0_PMAE    _AC(0x04000000, UL) /* performance monitor alert enabled */
+#define   MMCR0_PMAO    _AC(0x00000080, UL) /* performance monitor alert occurred */
+#define   MMCR0_FCH     _AC(0x00000001, UL) /* freeze conditions in hypervisor */
+#define SPRN_SIAR   796
+#define SPRN_SDAR   797
+#define SPRN_MMCR1  798
+
+/* As defined for PU G4 */
+#define SPRN_HID0   1008
+#define SPRN_HID1   1009
+#define SPRN_HID4   1012
+
+#define SPRN_DABR   1013
+#define SPRN_HID5   1014
+#define SPRN_DABRX  1015
+#define SPRN_HID6   1017
+#define SPRN_HID7   1018
+#define SPRN_HID8   1019
+#define SPRN_PIR    1023
+
+#define SPRN_LPCR   0x13E /* LPAR Control Register */
+#define   LPCR_VPM0         _AC(0x8000000000000000, UL)
+#define   LPCR_VPM1         _AC(0x4000000000000000, UL)
+#define   LPCR_ISL          _AC(0x2000000000000000, UL)
+#define   LPCR_VC_SH        61
+#define   LPCR_DPFD_SH      52
+#define   LPCR_DPFD         (_AC(7, UL) << LPCR_DPFD_SH)
+#define   LPCR_VRMASD_SH    47
+#define   LPCR_VRMASD       (_AC(0x1f, UL) << LPCR_VRMASD_SH)
+#define   LPCR_VRMA_L       _AC(0x0008000000000000, UL)
+#define   LPCR_VRMA_LP0     _AC(0x0001000000000000, UL)
+#define   LPCR_VRMA_LP1     _AC(0x0000800000000000, UL)
+#define   LPCR_RMLS         0x1C000000 /* Implementation dependent RMO limit sel */
+#define   LPCR_RMLS_SH      26
+#define   LPCR_HAIL         _AC(0x0000000004000000, UL) /* HV AIL (ISAv3.1) */
+#define   LPCR_ILE          _AC(0x0000000002000000, UL) /* !HV irqs set MSR:LE */
+#define   LPCR_AIL          _AC(0x0000000001800000, UL) /* Alternate interrupt location */
+#define   LPCR_AIL_0        _AC(0x0000000000000000, UL) /* MMU off exception offset 0x0 */
+#define   LPCR_AIL_3        _AC(0x0000000001800000, UL) /* MMU on exception offset 0xc00...4xxx */
+#define   LPCR_ONL          _AC(0x0000000000040000, UL) /* online - PURR/SPURR count */
+#define   LPCR_LD           _AC(0x0000000000020000, UL) /* large decrementer */
+#define   LPCR_PECE         _AC(0x000000000001f000, UL) /* powersave exit cause enable */
+#define     LPCR_PECEDP     _AC(0x0000000000010000, UL) /* directed priv dbells cause exit */
+#define     LPCR_PECEDH     _AC(0x0000000000008000, UL) /* directed hyp dbells cause exit */
+#define     LPCR_PECE0      _AC(0x0000000000004000, UL) /* ext. exceptions can cause exit */
+#define     LPCR_PECE1      _AC(0x0000000000002000, UL) /* decrementer can cause exit */
+#define     LPCR_PECE2      _AC(0x0000000000001000, UL) /* machine check etc can cause exit */
+#define     LPCR_PECE_HVEE  _AC(0x0000400000000000, UL) /* P9 Wakeup on HV interrupts */
+#define   LPCR_MER          _AC(0x0000000000000800, UL) /* Mediated External Exception */
+#define   LPCR_MER_SH       11
+#define   LPCR_GTSE         _AC(0x0000000000000400, UL) /* Guest Translation Shootdown Enable */
+#define   LPCR_TC           _AC(0x0000000000000200, UL) /* Translation control */
+#define   LPCR_HEIC         _AC(0x0000000000000010, UL) /* Hypervisor External Interrupt Control */
+#define   LPCR_LPES         0x0000000c
+#define   LPCR_LPES0        _AC(0x0000000000000008, UL) /* LPAR Env selector 0 */
+#define   LPCR_LPES1        _AC(0x0000000000000004, UL) /* LPAR Env selector 1 */
+#define   LPCR_LPES_SH      2
+#define   LPCR_RMI          _AC(0x0000000000000002, UL) /* real mode is cache inhibit */
+#define   LPCR_HVICE        _AC(0x0000000000000002, UL) /* P9: HV interrupt enable */
+#define   LPCR_HDICE        _AC(0x0000000000000001, UL) /* Hyp Decr enable (HV,PR,EE) */
+#define   LPCR_UPRT         _AC(0x0000000000400000, UL) /* Use Process Table (ISA 3) */
+#define   LPCR_HR           _AC(0x0000000000100000, UL)
+
+#endif /* _ASM_REG_DEFS_H_ */
diff --git a/xen/arch/ppc/include/asm/types.h b/xen/arch/ppc/include/asm/types.h
index cee08e111a..325a2a27c3 100644
--- a/xen/arch/ppc/include/asm/types.h
+++ b/xen/arch/ppc/include/asm/types.h
@@ -15,6 +15,7 @@  typedef unsigned int u32;
 typedef signed long s64;
 typedef unsigned long u64;
 typedef unsigned long paddr_t;
+typedef unsigned long vaddr_t;
 #define INVALID_PADDR (~0UL)
 #define PRIpaddr "016lx"
 
diff --git a/xen/arch/ppc/mm-radix.c b/xen/arch/ppc/mm-radix.c
new file mode 100644
index 0000000000..071e71b73e
--- /dev/null
+++ b/xen/arch/ppc/mm-radix.c
@@ -0,0 +1,268 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/types.h>
+#include <xen/lib.h>
+
+#include <asm/bitops.h>
+#include <asm/byteorder.h>
+#include <asm/early_printk.h>
+#include <asm/mm.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/regs.h>
+#include <asm/msr.h>
+
+void enable_mmu(void);
+
+#define INITIAL_LVL1_PD_COUNT      1
+#define INITIAL_LVL2_LVL3_PD_COUNT 2
+#define INITIAL_LVL4_PT_COUNT      256
+
+static size_t initial_lvl1_pd_pool_used;
+static struct lvl1_pd __aligned(sizeof(struct lvl1_pd))
+initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT];
+
+static size_t initial_lvl2_lvl3_pd_pool_used;
+static struct lvl2_pd __aligned(sizeof(struct lvl2_pd))
+initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT];
+
+static size_t initial_lvl4_pt_pool_used;
+static struct lvl4_pt __aligned(sizeof(struct lvl4_pt))
+initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT];
+
+/* Only reserve the minimum-sized partition and process tables */
+#define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */
+#define PATB_SIZE      (1UL << PATB_SIZE_LOG2)
+#define PRTB_SIZE_LOG2 12
+#define PRTB_SIZE      (1UL << PRTB_SIZE_LOG2)
+
+static struct patb_entry
+    __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)];
+
+static struct prtb_entry
+    __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)];
+
+_Static_assert(sizeof(initial_patb) == PATB_SIZE,
+               "initial partition table has unexpected size");
+_Static_assert(sizeof(initial_prtb) == PRTB_SIZE,
+               "initial process table has unexpected size");
+
+static __init struct lvl1_pd *lvl1_pd_pool_alloc(void)
+{
+    if ( initial_lvl1_pd_pool_used >= INITIAL_LVL1_PD_COUNT )
+    {
+        early_printk("Ran out of space for LVL1 PD!\n");
+        die();
+    }
+
+    return &initial_lvl1_pd_pool[initial_lvl1_pd_pool_used++];
+}
+
+static __init struct lvl2_pd *lvl2_pd_pool_alloc(void)
+{
+    if ( initial_lvl2_lvl3_pd_pool_used >= INITIAL_LVL2_LVL3_PD_COUNT )
+    {
+        early_printk("Ran out of space for LVL2/3 PD!\n");
+        die();
+    }
+
+    return &initial_lvl2_lvl3_pd_pool[initial_lvl2_lvl3_pd_pool_used++];
+}
+
+static __init struct lvl3_pd *lvl3_pd_pool_alloc(void)
+{
+    BUILD_BUG_ON(sizeof(struct lvl3_pd) != sizeof(struct lvl2_pd));
+
+    return (struct lvl3_pd *) lvl2_pd_pool_alloc();
+}
+
+static __init struct lvl4_pt *lvl4_pt_pool_alloc(void)
+{
+    if ( initial_lvl4_pt_pool_used >= INITIAL_LVL4_PT_COUNT )
+    {
+        early_printk("Ran out of space for LVL4 PT!\n");
+        die();
+    }
+
+    return &initial_lvl4_pt_pool[initial_lvl4_pt_pool_used++];
+}
+
+#ifndef NDEBUG
+/* TODO: Remove once we get common/ building */
+static char *__init itoa64_hex(uint64_t val, char *out_buf, size_t buf_len)
+{
+    uint64_t cur;
+    size_t i = buf_len - 1;
+
+    /* Null terminate buffer */
+    out_buf[i] = '\0';
+
+    /* Add digits in reverse */
+    cur = val;
+    while ( cur && i > 0 )
+    {
+        out_buf[--i] = "0123456789ABCDEF"[cur % 16];
+        cur /= 16;
+    }
+
+    /* Pad to 16 digits */
+    while ( i > 0 )
+        out_buf[--i] = '0';
+
+    return out_buf + i;
+}
+#endif
+
+static void __init radix_dprint(uint64_t addr, const char *msg)
+{
+#ifndef NDEBUG
+    char buf[sizeof("DEADBEEFCAFEBABA")];
+    char *addr_s = itoa64_hex(addr, buf, sizeof(buf));
+
+    early_printk("(0x");
+    early_printk(addr_s);
+    early_printk(") ");
+    early_printk(msg);
+#endif
+}
+
+static void __init setup_initial_mapping(struct lvl1_pd *lvl1,
+                                         vaddr_t map_start,
+                                         vaddr_t map_end,
+                                         paddr_t phys_base)
+{
+    uint64_t page_addr;
+
+    if ( map_start & ~PAGE_MASK )
+    {
+        early_printk("Xen _start must be aligned to a 64KB (PAGE_SIZE) boundary\n");
+        die();
+    }
+
+    if ( phys_base & ~PAGE_MASK )
+    {
+        early_printk("Xen must be loaded at a 64KB (PAGE_SIZE) boundary\n");
+        die();
+    }
+
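+    /*
+     * Map each page in [map_start, map_end) to phys_base onwards,
+     * allocating intermediate page directories from the static pools
+     * as needed. The MMU is still off at this point, so the physical
+     * addresses returned by pde_to_paddr() are dereferenced directly.
+     */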
+    for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE )
+    {
+        struct lvl2_pd *lvl2;
+        struct lvl3_pd *lvl3;
+        struct lvl4_pt *lvl4;
+        pde_t *pde;
+        pte_t *pte;
+
+        /* Allocate LVL 2 PD if necessary */
+        pde = pt_entry(lvl1, page_addr);
+        if ( !pde_is_valid(*pde) )
+        {
+            lvl2 = lvl2_pd_pool_alloc();
+            *pde = paddr_to_pde(__pa(lvl2), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_2);
+        }
+        else
+            lvl2 = (struct lvl2_pd *) pde_to_paddr(*pde);
+
+        /* Allocate LVL 3 PD if necessary */
+        pde = pt_entry(lvl2, page_addr);
+        if ( !pde_is_valid(*pde) )
+        {
+            lvl3 = lvl3_pd_pool_alloc();
+            *pde = paddr_to_pde(__pa(lvl3), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_3);
+        }
+        else
+            lvl3 = (struct lvl3_pd *) pde_to_paddr(*pde);
+
+        /* Allocate LVL 4 PT if necessary */
+        pde = pt_entry(lvl3, page_addr);
+        if ( !pde_is_valid(*pde) )
+        {
+            lvl4 = lvl4_pt_pool_alloc();
+            *pde = paddr_to_pde(__pa(lvl4), PDE_VALID, XEN_PT_ENTRIES_LOG2_LVL_4);
+        }
+        else
+            lvl4 = (struct lvl4_pt *) pde_to_paddr(*pde);
+
+        /* Finally, create PTE in LVL 4 PT */
+        pte = pt_entry(lvl4, page_addr);
+        if ( !pte_is_valid(*pte) )
+        {
+            unsigned long paddr = (page_addr - map_start) + phys_base;
+            unsigned long flags;
+
+            radix_dprint(paddr, "being mapped to ");
+            radix_dprint(page_addr, "!\n");
+            if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
+            {
+                radix_dprint(page_addr, "being marked as TEXT (RX)\n");
+                flags = PTE_XEN_RX;
+            }
+            else if ( is_kernel_rodata(page_addr) )
+            {
+                radix_dprint(page_addr, "being marked as RODATA (RO)\n");
+                flags = PTE_XEN_RO;
+            }
+            else
+            {
+                radix_dprint(page_addr, "being marked as DEFAULT (RW)\n");
+                flags = PTE_XEN_RW;
+            }
+
+            *pte = paddr_to_pte(paddr, flags);
+            radix_dprint(paddr_to_pte(paddr, flags).pte,
+                         "is result of PTE map!\n");
+        }
+        else
+        {
+            early_printk("BUG: Tried to create PTE for already-mapped page!");
+            die();
+        }
+    }
+}
+
+static void __init setup_partition_table(struct patb_entry *patb,
+                                         struct lvl1_pd *root)
+{
+    unsigned long ptcr;
+
+    /* Configure entry for LPID 0 to enable Radix and point to root PD */
+    uint64_t patb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1 | PATB0_HR;
+    uint64_t patb1 = __pa(initial_prtb) | (PRTB_SIZE_LOG2 - 12) | PATB1_GR;
+    patb[0].patb0 = cpu_to_be64(patb0);
+    patb[0].patb1 = cpu_to_be64(patb1);
+
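+    /*
+     * Per ISA 3.0, the low bits of PTCR (PATS) encode the partition
+     * table size as log2(size in bytes) - 12.
+     */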
+    ptcr = __pa(patb) | (PATB_SIZE_LOG2 - 12);
+    mtspr(SPRN_PTCR, ptcr);
+}
+
+static void __init setup_process_table(struct prtb_entry *prtb,
+                                       struct lvl1_pd *root)
+{
+    /* Configure entry for PID 0 to point to root PD */
+    uint64_t prtb0 = RTS_FIELD | __pa(root) | XEN_PT_ENTRIES_LOG2_LVL_1;
+
+    prtb[0].prtb0 = cpu_to_be64(prtb0);
+}
+
+void __init setup_initial_pagetables(void)
+{
+    struct lvl1_pd *root = lvl1_pd_pool_alloc();
+    unsigned long lpcr;
+
+    setup_initial_mapping(root, (vaddr_t) _start, (vaddr_t) _end,
+                          __pa(_start));
+
+    /* Enable Radix mode in LPCR */
+    lpcr = mfspr(SPRN_LPCR);
+    mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+    early_printk("Enabled radix in LPCR\n");
+
+    /* Set up initial process table */
+    setup_process_table(initial_prtb, root);
+
+    /* Set up initial partition table */
+    setup_partition_table(initial_patb, root);
+
+    /* Flush TLB */
+    tlbie_all();
+    early_printk("Flushed TLB\n");
+
+    /* Turn on the MMU */
+    enable_mmu();
+}
diff --git a/xen/arch/ppc/opal.c b/xen/arch/ppc/opal.c
index cc75ccfcbe..727f8726fa 100644
--- a/xen/arch/ppc/opal.c
+++ b/xen/arch/ppc/opal.c
@@ -1,8 +1,10 @@ 
 /* SPDX-License-Identifier: GPL-2.0-or-later */
+#include "xen/compiler.h"
 #include <asm/boot.h>
 #include <asm/early_printk.h>
 #include <asm/opal-api.h>
 #include <asm/processor.h>
+#include <asm/mm.h>
 #include <xen/types.h>
 #include <xen/libfdt/libfdt.h>
 #include <xen/init.h>
diff --git a/xen/arch/ppc/ppc64/head.S b/xen/arch/ppc/ppc64/head.S
index beff8257fa..5ae85ae123 100644
--- a/xen/arch/ppc/ppc64/head.S
+++ b/xen/arch/ppc/ppc64/head.S
@@ -2,6 +2,7 @@ 
 
 #include <asm/asm-defns.h>
 #include <asm/asm-offsets.h>
+#include <asm/msr.h>
 
     .section .text.header, "ax", %progbits
 
@@ -67,3 +68,17 @@  ENTRY(start)
 
     .size start, . - start
     .type start, %function
+
+ENTRY(enable_mmu)
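+    /*
+     * Xen is identity-mapped, so after the rfid below execution
+     * resumes at the same address with translation enabled.
+     */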
+    mflr %r3
+    mfmsr %r4
+
+    /* enable instruction and data relocation (MMU) */
+    ori %r4, %r4, (MSR_IR | MSR_DR)
+
+    mtsrr0 %r3 /* return to caller after MMU enable */
+    mtsrr1 %r4
+    rfid
+
+    .size enable_mmu, . - enable_mmu
+    .type enable_mmu, %function
diff --git a/xen/arch/ppc/setup.c b/xen/arch/ppc/setup.c
index bacd6d1acd..466993987b 100644
--- a/xen/arch/ppc/setup.c
+++ b/xen/arch/ppc/setup.c
@@ -7,6 +7,8 @@ 
 /* Xen stack for bringing up the first CPU. */
 unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __aligned(STACK_SIZE);
 
+void __init setup_initial_pagetables(void);
+
 void __init noreturn start_xen(unsigned long r3, unsigned long r4,
                                unsigned long r5, unsigned long r6,
                                unsigned long r7)
@@ -27,6 +29,8 @@  void __init noreturn start_xen(unsigned long r3, unsigned long r4,
         boot_opal_init((void *) r3);
     }
 
+    setup_initial_pagetables();
+
     early_printk("Hello, ppc64le!\n");
 
     for ( ; ; )
diff --git a/xen/arch/ppc/tlb-radix.c b/xen/arch/ppc/tlb-radix.c
new file mode 100644
index 0000000000..69934076a7
--- /dev/null
+++ b/xen/arch/ppc/tlb-radix.c
@@ -0,0 +1,95 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Based on arch/powerpc/mm/book3s64/radix_tlb.c from Linux with the following
+ * copyright notice:
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ */
+#include <xen/stringify.h>
+
+#include <asm/bitops.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+/* TLB flush actions. Used as argument to tlbiel_flush() */
+enum
+{
+    TLB_INVAL_SCOPE_LPID,   /* invalidate TLBs for current LPID */
+    TLB_INVAL_SCOPE_GLOBAL, /* invalidate all TLBs */
+};
+
+#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */
+
+#define RIC_FLUSH_TLB 0
+#define RIC_FLUSH_PWC 1
+#define RIC_FLUSH_ALL 2
+
+static void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
+                                    unsigned int pid, unsigned int ric,
+                                    unsigned int prs)
+{
+    unsigned long rb;
+    unsigned long rs;
+
+    rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+    rs = ((unsigned long) pid << PPC_BITLSHIFT(31));
+
+    asm volatile ( "tlbiel %0, %1, %2, %3, 1" :
+                  : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory" );
+}
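+
+/*
+ * Operand layout note (IBM bit numbering): RB holds the congruence
+ * class (set) number ending at bit 51 and the IS field in bits 52-53;
+ * RS carries the PID in its upper 32 bits.
+ */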
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+    unsigned int set;
+
+    asm volatile ( "ptesync" : : : "memory" );
+
+    /*
+     * Flush the first set of the TLB, and the entire Page Walk Cache
+     * and partition table entries. Then flush the remaining sets of the
+     * TLB.
+     */
+
+    if ( mfmsr() & MSR_HV )
+    {
+        /* In hypervisor (MSR[HV]) mode, flush partition-scope translations first. */
+        tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
+
+        for ( set = 1; set < num_sets; set++ )
+            tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
+    }
+
+    /* Flush process scoped entries. */
+    tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
+
+    for ( set = 1; set < num_sets; set++ )
+        tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
+
+    asm volatile ( "ptesync" : : : "memory" );
+}
+
+static void radix__tlbiel_all(unsigned int action)
+{
+    unsigned int is;
+
+    switch ( action )
+    {
+    case TLB_INVAL_SCOPE_GLOBAL:
+        is = 3;
+        break;
+    case TLB_INVAL_SCOPE_LPID:
+        is = 2;
+        break;
+    default:
+        die();
+    }
+
+    tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
+
+    asm volatile ( "slbia 7; isync" : : : "memory" );
+}
+
+void tlbie_all(void)
+{
+    radix__tlbiel_all(TLB_INVAL_SCOPE_GLOBAL);
+}