diff mbox series

[v2,4/6] xen/riscv: introduce identity mapping

Message ID 21d8ce65f718bc10c2213688f79cf5f978bcaf16.1687178053.git.oleksii.kurochko@gmail.com (mailing list archive)
State Superseded
Headers show
Series xen/riscv: introduce identity mapping | expand

Commit Message

Oleksii June 19, 2023, 1:34 p.m. UTC
The way how switch to virtual address was implemented in the
commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
isn't safe enough as:
* enable_mmu() depends on hooking all exceptions
  and pagefault.
* Any exception other than pagefault, or not taking a pagefault
  causes it to malfunction, which means you will fail to boot
  depending on where Xen was loaded into memory.

Instead of the proposed way of switching to virtual addresses was
decided to use identity mapping of the entrire Xen and after
switching to virtual addresses identity mapping is removed from
page-tables.
Since it is not easy to keep track where the identity map was mapped,
so we will look for the top-level entry exclusive to the identity
map and remove it.

Fixes: e66003e7be ("xen/riscv: introduce setup_initial_pages")
Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes in V2:
  - update definition of PGTBL_INITIAL_COUNT and the comment above.
  - code style fixes.
  - 1:1 mapping for entire Xen.
  - remove id_addrs array becase entire Xen is mapped.
  - reverse condition for cycle inside remove_identity_mapping().
  - fix page table walk in remove_identity_mapping().
  - update the commit message.
  - add Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
  - save hart_id and dtb_addr before call MMU related C functions.
  - use phys_offset variable instead of doing calcultations to get phys offset
    in head.S file. ( it can be easily done as entire Xen is 1:1 mapped )
  - declare enable_muu() as __init.
---
 xen/arch/riscv/include/asm/config.h |  2 +
 xen/arch/riscv/include/asm/mm.h     |  3 +-
 xen/arch/riscv/mm.c                 | 84 ++++++++++++++++-------------
 xen/arch/riscv/riscv64/head.S       | 34 ++++++++++++
 xen/arch/riscv/setup.c              | 14 +----
 5 files changed, 88 insertions(+), 49 deletions(-)

Comments

Jan Beulich July 6, 2023, 11:35 a.m. UTC | #1
On 19.06.2023 15:34, Oleksii Kurochko wrote:
> Since it is not easy to keep track where the identity map was mapped,
> so we will look for the top-level entry exclusive to the identity
> map and remove it.

I think you mean "top-most" or some such, as it's not necessarily the
top-level entry you zap.

> --- a/xen/arch/riscv/include/asm/config.h
> +++ b/xen/arch/riscv/include/asm/config.h
> @@ -1,3 +1,5 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
>  #ifndef __RISCV_CONFIG_H__
>  #define __RISCV_CONFIG_H__
>  

Unrelated change?

> --- a/xen/arch/riscv/mm.c
> +++ b/xen/arch/riscv/mm.c
> @@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
>  #define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
>  #define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
>  
> +/*
> + * Should be removed as soon as enough headers will be merged for inclusion of
> + * <xen/lib.h>.
> + */
> +#define ARRAY_SIZE(arr)		(sizeof(arr) / sizeof((arr)[0]))
> +
>  /*
>   * It is expected that Xen won't be more then 2 MB.
>   * The check in xen.lds.S guarantees that.
> @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
>   *
>   * It might be needed one more page table in case when Xen load address
>   * isn't 2 MB aligned.
> + *
> + * (CONFIG_PAGING_LEVELS - 1) page tables are needed for identity mapping.
>   */
> -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)

How come the extra page (see the comment sentence in context) isn't
needed for the identity-mapping case?

> @@ -255,25 +262,30 @@ void __init noreturn noinline enable_mmu()
>      csr_write(CSR_SATP,
>                PFN_DOWN((unsigned long)stage1_pgtbl_root) |
>                RV_STAGE1_MODE << SATP_MODE_SHIFT);
> +}
>  
> -    asm volatile ( ".p2align 2" );
> - mmu_is_enabled:
> -    /*
> -     * Stack should be re-inited as:
> -     * 1. Right now an address of the stack is relative to load time
> -     *    addresses what will cause an issue in case of load start address
> -     *    isn't equal to linker start address.
> -     * 2. Addresses in stack are all load time relative which can be an
> -     *    issue in case when load start address isn't equal to linker
> -     *    start address.
> -     *
> -     * We can't return to the caller because the stack was reseted
> -     * and it may have stash some variable on the stack.
> -     * Jump to a brand new function as the stack was reseted
> -     */
> +void __init remove_identity_mapping(void)
> +{
> +    unsigned int i;
> +    pte_t *pgtbl;
> +    unsigned int index, xen_index;
> +    unsigned long load_addr = LINK_TO_LOAD(_start);
>  
> -    switch_stack_and_jump((unsigned long)cpu0_boot_stack + STACK_SIZE,
> -                          cont_after_mmu_is_enabled);
> +    for ( pgtbl = stage1_pgtbl_root, i = 0;
> +          i <= (CONFIG_PAGING_LEVELS - 1);

i < CONFIG_PAGING_LEVELS ? But maybe it would be easier for i to start
at CONFIG_PAGING_LEVELS and be decremented, simplifying ...

> +          i++ )
> +    {
> +        index = pt_index(CONFIG_PAGING_LEVELS - 1 - i, load_addr);
> +        xen_index = pt_index(CONFIG_PAGING_LEVELS - 1 - i, XEN_VIRT_START);

... these two expressions?

> +        if ( index != xen_index )
> +        {
> +            pgtbl[index].pte = 0;
> +            break;
> +        }

Is this enough? When load and link address are pretty close (but not
overlapping), can't they share a leaf table, in which case you need
to clear more than just a single entry? The present overlap check
looks to be 4k-granular, not 2M (in which case this couldn't happen;
IOW adjusting the overlap check may also be a way out).

Jan
Oleksii July 7, 2023, 10:37 a.m. UTC | #2
On Thu, 2023-07-06 at 13:35 +0200, Jan Beulich wrote:
> On 19.06.2023 15:34, Oleksii Kurochko wrote:
> > Since it is not easy to keep track where the identity map was
> > mapped,
> > so we will look for the top-level entry exclusive to the identity
> > map and remove it.
> 
> I think you mean "top-most" or some such, as it's not necessarily the
> top-level entry you zap.
Yeah, 'top-most' is more appropriate in this context.
> 
> > --- a/xen/arch/riscv/include/asm/config.h
> > +++ b/xen/arch/riscv/include/asm/config.h
> > @@ -1,3 +1,5 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +
> >  #ifndef __RISCV_CONFIG_H__
> >  #define __RISCV_CONFIG_H__
> >  
> 
> Unrelated change?
It  should be part of [PATCH v2 5/6] xen/riscv: introduce identity
mapping.

> 
> > --- a/xen/arch/riscv/mm.c
> > +++ b/xen/arch/riscv/mm.c
> > @@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
> >  #define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
> >  #define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
> >  
> > +/*
> > + * Should be removed as soon as enough headers will be merged for
> > inclusion of
> > + * <xen/lib.h>.
> > + */
> > +#define ARRAY_SIZE(arr)                (sizeof(arr) /
> > sizeof((arr)[0]))
> > +
> >  /*
> >   * It is expected that Xen won't be more then 2 MB.
> >   * The check in xen.lds.S guarantees that.
> > @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
> >   *
> >   * It might be needed one more page table in case when Xen load
> > address
> >   * isn't 2 MB aligned.
> > + *
> > + * (CONFIG_PAGING_LEVELS - 1) page tables are needed for identity
> > mapping.
> >   */
> > -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> > +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)
> 
> How come the extra page (see the comment sentence in context) isn't
> needed for the identity-mapping case?
It is needed to allocate no more than two 'nonroot' page tables (L0 and
L1 in case of Sv39 ) as page 'root' table ( L2 in case of Sv39 ) is
always re-used.

The same ( only 'nonroot' page tables might be needed to allocate )
works for any MMU mode.

> 
> > @@ -255,25 +262,30 @@ void __init noreturn noinline enable_mmu()
> >      csr_write(CSR_SATP,
> >                PFN_DOWN((unsigned long)stage1_pgtbl_root) |
> >                RV_STAGE1_MODE << SATP_MODE_SHIFT);
> > +}
> >  
> > -    asm volatile ( ".p2align 2" );
> > - mmu_is_enabled:
> > -    /*
> > -     * Stack should be re-inited as:
> > -     * 1. Right now an address of the stack is relative to load
> > time
> > -     *    addresses what will cause an issue in case of load start
> > address
> > -     *    isn't equal to linker start address.
> > -     * 2. Addresses in stack are all load time relative which can
> > be an
> > -     *    issue in case when load start address isn't equal to
> > linker
> > -     *    start address.
> > -     *
> > -     * We can't return to the caller because the stack was reseted
> > -     * and it may have stash some variable on the stack.
> > -     * Jump to a brand new function as the stack was reseted
> > -     */
> > +void __init remove_identity_mapping(void)
> > +{
> > +    unsigned int i;
> > +    pte_t *pgtbl;
> > +    unsigned int index, xen_index;
> > +    unsigned long load_addr = LINK_TO_LOAD(_start);
> >  
> > -    switch_stack_and_jump((unsigned long)cpu0_boot_stack +
> > STACK_SIZE,
> > -                          cont_after_mmu_is_enabled);
> > +    for ( pgtbl = stage1_pgtbl_root, i = 0;
> > +          i <= (CONFIG_PAGING_LEVELS - 1);
> 
> i < CONFIG_PAGING_LEVELS ? But maybe it would be easier for i to
> start
> at CONFIG_PAGING_LEVELS and be decremented, simplifying ...
> 
> > +          i++ )
> > +    {
> > +        index = pt_index(CONFIG_PAGING_LEVELS - 1 - i, load_addr);
> > +        xen_index = pt_index(CONFIG_PAGING_LEVELS - 1 - i,
> > XEN_VIRT_START);
> 
> ... these two expressions?
It makes sense. I'll update this part of the code.

> 
> > +        if ( index != xen_index )
> > +        {
> > +            pgtbl[index].pte = 0;
> > +            break;
> > +        }
> 
> Is this enough? When load and link address are pretty close (but not
> overlapping), can't they share a leaf table, in which case you need
> to clear more than just a single entry? The present overlap check
> looks to be 4k-granular, not 2M (in which case this couldn't happen;
> IOW adjusting the overlap check may also be a way out).
At the start of setup_initial_pagetables() there is a code which checks
that load and link address don't overlap:

    if ( (linker_start != load_start) &&
         (linker_start <= load_end) && (load_start <= linker_end) )
    {
        early_printk("(XEN) linker and load address ranges overlap\n");
        die();
    }

So the closest difference between load and link address can be 4kb.
Otherwise load and link address ranges are equal ( as we can't map less
then 4kb).

Let's take concrete examples:
  Load address range is   0x8020_0000 - 0x8020_0FFF
  Linker address range is 0x8020_1000 - 0x8020_1FFF
  MMU mode: Sv39 ( so we have 3 page tables )

  So we have:
    * L2 index = 2, L1 index = 1, L0 index = 0 for load address
    * L2 index = 2, L1 index = 1, L0 index = 1 for linker address
  Thereby we have two different L0 tables for load and linker address 
ranges.
  And it looks like it is pretty safe to remove only one L0 index that
was used for identity mapping.

Is it possible that I missed something?

~ Oleksii
Jan Beulich July 7, 2023, 10:51 a.m. UTC | #3
On 07.07.2023 12:37, Oleksii wrote:
> On Thu, 2023-07-06 at 13:35 +0200, Jan Beulich wrote:
>> On 19.06.2023 15:34, Oleksii Kurochko wrote:
>>> --- a/xen/arch/riscv/include/asm/config.h
>>> +++ b/xen/arch/riscv/include/asm/config.h
>>> @@ -1,3 +1,5 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +
>>>  #ifndef __RISCV_CONFIG_H__
>>>  #define __RISCV_CONFIG_H__
>>>  
>>
>> Unrelated change?
> It  should be part of [PATCH v2 5/6] xen/riscv: introduce identity
> mapping.

Hmm, here we're discussing "[PATCH v2 4/6] xen/riscv: introduce identity
mapping". I'm confused, I guess.

>>> --- a/xen/arch/riscv/mm.c
>>> +++ b/xen/arch/riscv/mm.c
>>> @@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
>>>  #define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
>>>  #define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
>>>  
>>> +/*
>>> + * Should be removed as soon as enough headers will be merged for
>>> inclusion of
>>> + * <xen/lib.h>.
>>> + */
>>> +#define ARRAY_SIZE(arr)                (sizeof(arr) /
>>> sizeof((arr)[0]))
>>> +
>>>  /*
>>>   * It is expected that Xen won't be more then 2 MB.
>>>   * The check in xen.lds.S guarantees that.
>>> @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
>>>   *
>>>   * It might be needed one more page table in case when Xen load
>>> address
>>>   * isn't 2 MB aligned.
>>> + *
>>> + * (CONFIG_PAGING_LEVELS - 1) page tables are needed for identity
>>> mapping.
>>>   */
>>> -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
>>> +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)
>>
>> How come the extra page (see the comment sentence in context) isn't
>> needed for the identity-mapping case?
> It is needed to allocate no more than two 'nonroot' page tables (L0 and
> L1 in case of Sv39 ) as page 'root' table ( L2 in case of Sv39 ) is
> always re-used.
> 
> The same ( only 'nonroot' page tables might be needed to allocate )
> works for any MMU mode.

Of course, but if you cross a 2Mb boundary you'll need 2 L0 tables.

>>> @@ -255,25 +262,30 @@ void __init noreturn noinline enable_mmu()
>>>      csr_write(CSR_SATP,
>>>                PFN_DOWN((unsigned long)stage1_pgtbl_root) |
>>>                RV_STAGE1_MODE << SATP_MODE_SHIFT);
>>> +}
>>>  
>>> -    asm volatile ( ".p2align 2" );
>>> - mmu_is_enabled:
>>> -    /*
>>> -     * Stack should be re-inited as:
>>> -     * 1. Right now an address of the stack is relative to load
>>> time
>>> -     *    addresses what will cause an issue in case of load start
>>> address
>>> -     *    isn't equal to linker start address.
>>> -     * 2. Addresses in stack are all load time relative which can
>>> be an
>>> -     *    issue in case when load start address isn't equal to
>>> linker
>>> -     *    start address.
>>> -     *
>>> -     * We can't return to the caller because the stack was reseted
>>> -     * and it may have stash some variable on the stack.
>>> -     * Jump to a brand new function as the stack was reseted
>>> -     */
>>> +void __init remove_identity_mapping(void)
>>> +{
>>> +    unsigned int i;
>>> +    pte_t *pgtbl;
>>> +    unsigned int index, xen_index;
>>> +    unsigned long load_addr = LINK_TO_LOAD(_start);
>>>  
>>> -    switch_stack_and_jump((unsigned long)cpu0_boot_stack +
>>> STACK_SIZE,
>>> -                          cont_after_mmu_is_enabled);
>>> +    for ( pgtbl = stage1_pgtbl_root, i = 0;
>>> +          i <= (CONFIG_PAGING_LEVELS - 1);
>>
>> i < CONFIG_PAGING_LEVELS ? But maybe it would be easier for i to
>> start
>> at CONFIG_PAGING_LEVELS and be decremented, simplifying ...
>>
>>> +          i++ )
>>> +    {
>>> +        index = pt_index(CONFIG_PAGING_LEVELS - 1 - i, load_addr);
>>> +        xen_index = pt_index(CONFIG_PAGING_LEVELS - 1 - i,
>>> XEN_VIRT_START);
>>
>> ... these two expressions?
> It makes sense. I'll update this part of the code.
> 
>>
>>> +        if ( index != xen_index )
>>> +        {
>>> +            pgtbl[index].pte = 0;
>>> +            break;
>>> +        }
>>
>> Is this enough? When load and link address are pretty close (but not
>> overlapping), can't they share a leaf table, in which case you need
>> to clear more than just a single entry? The present overlap check
>> looks to be 4k-granular, not 2M (in which case this couldn't happen;
>> IOW adjusting the overlap check may also be a way out).
> At the start of setup_initial_pagetables() there is a code which checks
> that load and link address don't overlap:
> 
>     if ( (linker_start != load_start) &&
>          (linker_start <= load_end) && (load_start <= linker_end) )
>     {
>         early_printk("(XEN) linker and load address ranges overlap\n");
>         die();
>     }
> 
> So the closest difference between load and link address can be 4kb.
> Otherwise load and link address ranges are equal ( as we can't map less
> then 4kb).
> 
> Let's take concrete examples:
>   Load address range is   0x8020_0000 - 0x8020_0FFF
>   Linker address range is 0x8020_1000 - 0x8020_1FFF
>   MMU mode: Sv39 ( so we have 3 page tables )
> 
>   So we have:
>     * L2 index = 2, L1 index = 1, L0 index = 0 for load address
>     * L2 index = 2, L1 index = 1, L0 index = 1 for linker address
>   Thereby we have two different L0 tables for load and linker address 
> ranges.
>   And it looks like it is pretty safe to remove only one L0 index that
> was used for identity mapping.
> 
> Is it possible that I missed something?

Looks as if you are thinking of only a Xen which fits in 4k. The code
here wants to cope with Xen getting quite a bit bigger.

Jan
Oleksii July 10, 2023, 8:53 a.m. UTC | #4
On Fri, 2023-07-07 at 12:51 +0200, Jan Beulich wrote:
> On 07.07.2023 12:37, Oleksii wrote:
> > On Thu, 2023-07-06 at 13:35 +0200, Jan Beulich wrote:
> > > On 19.06.2023 15:34, Oleksii Kurochko wrote:
> > > > --- a/xen/arch/riscv/include/asm/config.h
> > > > +++ b/xen/arch/riscv/include/asm/config.h
> > > > @@ -1,3 +1,5 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > > +
> > > >  #ifndef __RISCV_CONFIG_H__
> > > >  #define __RISCV_CONFIG_H__
> > > >  
> > > 
> > > Unrelated change?
> > It  should be part of [PATCH v2 5/6] xen/riscv: introduce identity
> > mapping.
> 
> Hmm, here we're discussing "[PATCH v2 4/6] xen/riscv: introduce
> identity
> mapping". I'm confused, I guess.
Sorry for confusion. i meant the patch: [PATCH v2 5/6] xen/riscv: add
SPDX tags.

> 
> > > > --- a/xen/arch/riscv/mm.c
> > > > +++ b/xen/arch/riscv/mm.c
> > > > @@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
> > > >  #define LOAD_TO_LINK(addr) ((unsigned long)(addr) -
> > > > phys_offset)
> > > >  #define LINK_TO_LOAD(addr) ((unsigned long)(addr) +
> > > > phys_offset)
> > > >  
> > > > +/*
> > > > + * Should be removed as soon as enough headers will be merged
> > > > for
> > > > inclusion of
> > > > + * <xen/lib.h>.
> > > > + */
> > > > +#define ARRAY_SIZE(arr)                (sizeof(arr) /
> > > > sizeof((arr)[0]))
> > > > +
> > > >  /*
> > > >   * It is expected that Xen won't be more then 2 MB.
> > > >   * The check in xen.lds.S guarantees that.
> > > > @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
> > > >   *
> > > >   * It might be needed one more page table in case when Xen
> > > > load
> > > > address
> > > >   * isn't 2 MB aligned.
> > > > + *
> > > > + * (CONFIG_PAGING_LEVELS - 1) page tables are needed for
> > > > identity
> > > > mapping.
> > > >   */
> > > > -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> > > > +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 +
> > > > 1)
> > > 
> > > How come the extra page (see the comment sentence in context)
> > > isn't
> > > needed for the identity-mapping case?
> > It is needed to allocate no more than two 'nonroot' page tables (L0
> > and
> > L1 in case of Sv39 ) as page 'root' table ( L2 in case of Sv39 ) is
> > always re-used.
> > 
> > The same ( only 'nonroot' page tables might be needed to allocate )
> > works for any MMU mode.
> 
> Of course, but if you cross a 2Mb boundary you'll need 2 L0 tables.
Yes, in the case of crossing a 2Mb boundary, it will require 2 L0
tables.

Then, the number of required page tables is needed depending on Xen
size and load address alignment. Because for each 2Mb, we need a new L0
table.

Sure, this is not needed now ( as in xen.lds.S, we have a Xen size
check ), but if someone increases Xen size binary to 4Mb, then the
amount of page tables should be updated too.
Should we take into account that?

> 
> > > > @@ -255,25 +262,30 @@ void __init noreturn noinline
> > > > enable_mmu()
> > > >      csr_write(CSR_SATP,
> > > >                PFN_DOWN((unsigned long)stage1_pgtbl_root) |
> > > >                RV_STAGE1_MODE << SATP_MODE_SHIFT);
> > > > +}
> > > >  
> > > > -    asm volatile ( ".p2align 2" );
> > > > - mmu_is_enabled:
> > > > -    /*
> > > > -     * Stack should be re-inited as:
> > > > -     * 1. Right now an address of the stack is relative to
> > > > load
> > > > time
> > > > -     *    addresses what will cause an issue in case of load
> > > > start
> > > > address
> > > > -     *    isn't equal to linker start address.
> > > > -     * 2. Addresses in stack are all load time relative which
> > > > can
> > > > be an
> > > > -     *    issue in case when load start address isn't equal to
> > > > linker
> > > > -     *    start address.
> > > > -     *
> > > > -     * We can't return to the caller because the stack was
> > > > reseted
> > > > -     * and it may have stash some variable on the stack.
> > > > -     * Jump to a brand new function as the stack was reseted
> > > > -     */
> > > > +void __init remove_identity_mapping(void)
> > > > +{
> > > > +    unsigned int i;
> > > > +    pte_t *pgtbl;
> > > > +    unsigned int index, xen_index;
> > > > +    unsigned long load_addr = LINK_TO_LOAD(_start);
> > > >  
> > > > -    switch_stack_and_jump((unsigned long)cpu0_boot_stack +
> > > > STACK_SIZE,
> > > > -                          cont_after_mmu_is_enabled);
> > > > +    for ( pgtbl = stage1_pgtbl_root, i = 0;
> > > > +          i <= (CONFIG_PAGING_LEVELS - 1);
> > > 
> > > i < CONFIG_PAGING_LEVELS ? But maybe it would be easier for i to
> > > start
> > > at CONFIG_PAGING_LEVELS and be decremented, simplifying ...
> > > 
> > > > +          i++ )
> > > > +    {
> > > > +        index = pt_index(CONFIG_PAGING_LEVELS - 1 - i,
> > > > load_addr);
> > > > +        xen_index = pt_index(CONFIG_PAGING_LEVELS - 1 - i,
> > > > XEN_VIRT_START);
> > > 
> > > ... these two expressions?
> > It makes sense. I'll update this part of the code.
> > 
> > > 
> > > > +        if ( index != xen_index )
> > > > +        {
> > > > +            pgtbl[index].pte = 0;
> > > > +            break;
> > > > +        }
> > > 
> > > Is this enough? When load and link address are pretty close (but
> > > not
> > > overlapping), can't they share a leaf table, in which case you
> > > need
> > > to clear more than just a single entry? The present overlap check
> > > looks to be 4k-granular, not 2M (in which case this couldn't
> > > happen;
> > > IOW adjusting the overlap check may also be a way out).
> > At the start of setup_initial_pagetables() there is a code which
> > checks
> > that load and link address don't overlap:
> > 
> >     if ( (linker_start != load_start) &&
> >          (linker_start <= load_end) && (load_start <= linker_end) )
> >     {
> >         early_printk("(XEN) linker and load address ranges
> > overlap\n");
> >         die();
> >     }
> > 
> > So the closest difference between load and link address can be 4kb.
> > Otherwise load and link address ranges are equal ( as we can't map
> > less
> > then 4kb).
> > 
> > Let's take concrete examples:
> >   Load address range is   0x8020_0000 - 0x8020_0FFF
> >   Linker address range is 0x8020_1000 - 0x8020_1FFF
> >   MMU mode: Sv39 ( so we have 3 page tables )
> > 
> >   So we have:
> >     * L2 index = 2, L1 index = 1, L0 index = 0 for load address
> >     * L2 index = 2, L1 index = 1, L0 index = 1 for linker address
> >   Thereby we have two different L0 tables for load and linker
> > address 
> > ranges.
> >   And it looks like it is pretty safe to remove only one L0 index
> > that
> > was used for identity mapping.
> > 
> > Is it possible that I missed something?
> 
> Looks as if you are thinking of only a Xen which fits in 4k. The code
> here wants to cope with Xen getting quite a bit bigger.

Yeah, I missed that when I tried to come up with an example.

So it will probably be more universal if we recursively go through the
whole identity mapping and unmap each pte individually.
Jan Beulich July 10, 2023, 9:34 a.m. UTC | #5
On 10.07.2023 10:53, Oleksii wrote:
> On Fri, 2023-07-07 at 12:51 +0200, Jan Beulich wrote:
>> On 07.07.2023 12:37, Oleksii wrote:
>>> On Thu, 2023-07-06 at 13:35 +0200, Jan Beulich wrote:
>>>> On 19.06.2023 15:34, Oleksii Kurochko wrote:
>>>>> --- a/xen/arch/riscv/mm.c
>>>>> +++ b/xen/arch/riscv/mm.c
>>>>> @@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
>>>>>  #define LOAD_TO_LINK(addr) ((unsigned long)(addr) -
>>>>> phys_offset)
>>>>>  #define LINK_TO_LOAD(addr) ((unsigned long)(addr) +
>>>>> phys_offset)
>>>>>  
>>>>> +/*
>>>>> + * Should be removed as soon as enough headers will be merged
>>>>> for
>>>>> inclusion of
>>>>> + * <xen/lib.h>.
>>>>> + */
>>>>> +#define ARRAY_SIZE(arr)                (sizeof(arr) /
>>>>> sizeof((arr)[0]))
>>>>> +
>>>>>  /*
>>>>>   * It is expected that Xen won't be more then 2 MB.
>>>>>   * The check in xen.lds.S guarantees that.
>>>>> @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
>>>>>   *
>>>>>   * It might be needed one more page table in case when Xen
>>>>> load
>>>>> address
>>>>>   * isn't 2 MB aligned.
>>>>> + *
>>>>> + * (CONFIG_PAGING_LEVELS - 1) page tables are needed for
>>>>> identity
>>>>> mapping.
>>>>>   */
>>>>> -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
>>>>> +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 +
>>>>> 1)
>>>>
>>>> How come the extra page (see the comment sentence in context)
>>>> isn't
>>>> needed for the identity-mapping case?
>>> It is needed to allocate no more than two 'nonroot' page tables (L0
>>> and
>>> L1 in case of Sv39 ) as page 'root' table ( L2 in case of Sv39 ) is
>>> always re-used.
>>>
>>> The same ( only 'nonroot' page tables might be needed to allocate )
>>> works for any MMU mode.
>>
>> Of course, but if you cross a 2Mb boundary you'll need 2 L0 tables.
> Yes, in the case of crossing a 2Mb boundary, it will require 2 L0
> tables.
> 
> Then, the number of required page tables is needed depending on Xen
> size and load address alignment. Because for each 2Mb, we need a new L0
> table.
> 
> Sure, this is not needed now ( as in xen.lds.S, we have a Xen size
> check ), but if someone increases Xen size binary to 4Mb, then the
> amount of page tables should be updated too.
> Should we take into account that?

Perhaps by way of a BUILD_BUG_ON(), yes. We want to avoid setting up
a (runtime) trap for someone to fall into.

Jan
diff mbox series

Patch

diff --git a/xen/arch/riscv/include/asm/config.h b/xen/arch/riscv/include/asm/config.h
index 38862df0b8..fa90ae0898 100644
--- a/xen/arch/riscv/include/asm/config.h
+++ b/xen/arch/riscv/include/asm/config.h
@@ -1,3 +1,5 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
 #ifndef __RISCV_CONFIG_H__
 #define __RISCV_CONFIG_H__
 
diff --git a/xen/arch/riscv/include/asm/mm.h b/xen/arch/riscv/include/asm/mm.h
index 996041ce81..500fdc9c5a 100644
--- a/xen/arch/riscv/include/asm/mm.h
+++ b/xen/arch/riscv/include/asm/mm.h
@@ -9,7 +9,8 @@ 
 void setup_initial_pagetables(void);
 
 void enable_mmu(void);
-void cont_after_mmu_is_enabled(void);
+
+void remove_identity_mapping(void);
 
 void calc_phys_offset(void);
 
diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
index 570033f17f..2693b817c5 100644
--- a/xen/arch/riscv/mm.c
+++ b/xen/arch/riscv/mm.c
@@ -25,6 +25,12 @@  unsigned long __ro_after_init phys_offset;
 #define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
 #define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
 
+/*
+ * Should be removed as soon as enough headers will be merged for inclusion of
+ * <xen/lib.h>.
+ */
+#define ARRAY_SIZE(arr)		(sizeof(arr) / sizeof((arr)[0]))
+
 /*
  * It is expected that Xen won't be more then 2 MB.
  * The check in xen.lds.S guarantees that.
@@ -35,8 +41,10 @@  unsigned long __ro_after_init phys_offset;
  *
  * It might be needed one more page table in case when Xen load address
  * isn't 2 MB aligned.
+ *
+ * (CONFIG_PAGING_LEVELS - 1) page tables are needed for identity mapping.
  */
-#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
+#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)
 
 pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
 stage1_pgtbl_root[PAGETABLE_ENTRIES];
@@ -75,6 +83,7 @@  static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
     unsigned int index;
     pte_t *pgtbl;
     unsigned long page_addr;
+    bool is_identity_mapping = map_start == pa_start;
 
     if ( (unsigned long)_start % XEN_PT_LEVEL_SIZE(0) )
     {
@@ -108,16 +117,18 @@  static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
             {
                 unsigned long paddr = (page_addr - map_start) + pa_start;
                 unsigned int permissions = PTE_LEAF_DEFAULT;
+                unsigned long addr = is_identity_mapping
+                                     ? page_addr : LINK_TO_LOAD(page_addr);
                 pte_t pte_to_be_written;
 
                 index = pt_index(0, page_addr);
 
-                if ( is_kernel_text(LINK_TO_LOAD(page_addr)) ||
-                     is_kernel_inittext(LINK_TO_LOAD(page_addr)) )
-                    permissions =
-                        PTE_EXECUTABLE | PTE_READABLE | PTE_VALID;
+                if ( is_kernel_text(addr) ||
+                     is_kernel_inittext(addr) )
+                        permissions =
+                            PTE_EXECUTABLE | PTE_READABLE | PTE_VALID;
 
-                if ( is_kernel_rodata(LINK_TO_LOAD(page_addr)) )
+                if ( is_kernel_rodata(addr) )
                     permissions = PTE_READABLE | PTE_VALID;
 
                 pte_to_be_written = paddr_to_pte(paddr, permissions);
@@ -232,22 +243,18 @@  void __init setup_initial_pagetables(void)
                           linker_start,
                           linker_end,
                           load_start);
+
+    if ( linker_start == load_start )
+        return;
+
+    setup_initial_mapping(&mmu_desc,
+                          load_start,
+                          load_end,
+                          load_start);
 }
 
-void __init noreturn noinline enable_mmu()
+void __init enable_mmu(void)
 {
-    /*
-     * Calculate a linker time address of the mmu_is_enabled
-     * label and update CSR_STVEC with it.
-     * MMU is configured in a way where linker addresses are mapped
-     * on load addresses so in a case when linker addresses are not equal
-     * to load addresses, after MMU is enabled, it will cause
-     * an exception and jump to linker time addresses.
-     * Otherwise if load addresses are equal to linker addresses the code
-     * after mmu_is_enabled label will be executed without exception.
-     */
-    csr_write(CSR_STVEC, LOAD_TO_LINK((unsigned long)&&mmu_is_enabled));
-
     /* Ensure page table writes precede loading the SATP */
     sfence_vma();
 
@@ -255,25 +262,30 @@  void __init noreturn noinline enable_mmu()
     csr_write(CSR_SATP,
               PFN_DOWN((unsigned long)stage1_pgtbl_root) |
               RV_STAGE1_MODE << SATP_MODE_SHIFT);
+}
 
-    asm volatile ( ".p2align 2" );
- mmu_is_enabled:
-    /*
-     * Stack should be re-inited as:
-     * 1. Right now an address of the stack is relative to load time
-     *    addresses what will cause an issue in case of load start address
-     *    isn't equal to linker start address.
-     * 2. Addresses in stack are all load time relative which can be an
-     *    issue in case when load start address isn't equal to linker
-     *    start address.
-     *
-     * We can't return to the caller because the stack was reseted
-     * and it may have stash some variable on the stack.
-     * Jump to a brand new function as the stack was reseted
-     */
+void __init remove_identity_mapping(void)
+{
+    unsigned int i;
+    pte_t *pgtbl;
+    unsigned int index, xen_index;
+    unsigned long load_addr = LINK_TO_LOAD(_start);
 
-    switch_stack_and_jump((unsigned long)cpu0_boot_stack + STACK_SIZE,
-                          cont_after_mmu_is_enabled);
+    for ( pgtbl = stage1_pgtbl_root, i = 0;
+          i <= (CONFIG_PAGING_LEVELS - 1);
+          i++ )
+    {
+        index = pt_index(CONFIG_PAGING_LEVELS - 1 - i, load_addr);
+        xen_index = pt_index(CONFIG_PAGING_LEVELS - 1 - i, XEN_VIRT_START);
+
+        if ( index != xen_index )
+        {
+            pgtbl[index].pte = 0;
+            break;
+        }
+
+        pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
+    }
 }
 
 /*
diff --git a/xen/arch/riscv/riscv64/head.S b/xen/arch/riscv/riscv64/head.S
index 850fbb58a7..41983ffe63 100644
--- a/xen/arch/riscv/riscv64/head.S
+++ b/xen/arch/riscv/riscv64/head.S
@@ -29,8 +29,42 @@  ENTRY(start)
 
         jal     reset_stack
 
+        /*
+         * save hart_id and dtb_base as a0 and a1 register can be used
+         * by C code ( f.e. setup_initial_pagetables will update a0 and
+         * a1 )
+         */
+        mv      s0, a0
+        mv      s1, a1
+
         jal     calc_phys_offset
 
+        jal     setup_initial_pagetables
+
+        jal     enable_mmu
+
+        la      t1, phys_offset
+        REG_L   t1, (t1)
+
+        /* Calculate proper VA after jump from 1:1 mapping */
+        la      t0, .L_primary_switched
+        sub     t0, t0, t1
+
+        /* Jump from 1:1 mapping world */
+        jr      t0
+
+.L_primary_switched:
+        /*
+         * cpu0_boot_stack address is 1:1 mapping related so it should be
+         * recalculated after jump from 1:1 mapping world as 1:1 mapping
+         * will be removed soon in start_xen().
+         */
+        jal     reset_stack
+
+        /* restore bootcpu_id and dtb address */
+        mv      a0, s0
+        mv      a1, s1
+
         tail    start_xen
 
         .section .text, "ax", %progbits
diff --git a/xen/arch/riscv/setup.c b/xen/arch/riscv/setup.c
index 845d18d86f..c4ef0b3165 100644
--- a/xen/arch/riscv/setup.c
+++ b/xen/arch/riscv/setup.c
@@ -11,20 +11,10 @@  unsigned char __initdata cpu0_boot_stack[STACK_SIZE]
 void __init noreturn start_xen(unsigned long bootcpu_id,
                                paddr_t dtb_addr)
 {
-    early_printk("Hello from C env\n");
-
-    setup_initial_pagetables();
-
-    enable_mmu();
-
-    for ( ;; )
-        asm volatile ("wfi");
+    remove_identity_mapping();
 
-    unreachable();
-}
+    early_printk("Hello from C env\n");
 
-void __init noreturn cont_after_mmu_is_enabled(void)
-{
     early_printk("All set up\n");
 
     for ( ;; )