
[v1,5/8] xen/riscv: introduce identity mapping

Message ID 32aef31768cd81ffc8c848af6c29cd8510bbbf6d.1686080337.git.oleksii.kurochko@gmail.com (mailing list archive)
State Superseded
Series xen/riscv: introduce identity mapping

Commit Message

Oleksii Kurochko June 6, 2023, 7:55 p.m. UTC
The way the switch to virtual addresses was implemented in
commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
wasn't safe enough, so an identity mapping was introduced and
used.

Fixes: e66003e7be ("xen/riscv: introduce setup_initial_pages")
Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
 xen/arch/riscv/include/asm/mm.h |  3 +-
 xen/arch/riscv/mm.c             | 99 ++++++++++++++++++++++-----------
 xen/arch/riscv/riscv64/head.S   | 30 ++++++++++
 xen/arch/riscv/setup.c          | 14 +----
 4 files changed, 99 insertions(+), 47 deletions(-)

Comments

Jan Beulich June 12, 2023, 1:48 p.m. UTC | #1
On 06.06.2023 21:55, Oleksii Kurochko wrote:
> The way the switch to virtual addresses was implemented in
> commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
> wasn't safe enough, so an identity mapping was introduced and
> used.

I don't think this is sufficient as a description. You want to make
clear what the "not safe enough" is, and you also want to go into
more detail as to the solution chosen. I'm particularly puzzled that
you map just two singular pages ...

> @@ -35,8 +40,10 @@ static unsigned long phys_offset;
>   *
>   * It might be needed one more page table in case when Xen load address
>   * isn't 2 MB aligned.
> + *
> + * 3 additional page tables are needed for identity mapping.
>   */
> -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1 + 3)

What is this 3 coming from? It feels like the value should (again)
somehow depend on CONFIG_PAGING_LEVELS.

> @@ -108,16 +116,18 @@ static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
>              {
>                  unsigned long paddr = (page_addr - map_start) + pa_start;
>                  unsigned int permissions = PTE_LEAF_DEFAULT;
> +                unsigned long addr = (is_identity_mapping) ?

Nit: No need for parentheses here.

> +                                     page_addr : LINK_TO_LOAD(page_addr);

As a remark, while we want binary operators at the end of lines when
wrapping, we usually do things differently for the ternary operator:
Either

                unsigned long addr = is_identity_mapping
                                     ? page_addr : LINK_TO_LOAD(page_addr);

or

                unsigned long addr = is_identity_mapping
                                     ? page_addr
                                     : LINK_TO_LOAD(page_addr);

.

> @@ -232,22 +242,27 @@ void __init setup_initial_pagetables(void)
>                            linker_start,
>                            linker_end,
>                            load_start);
> +
> +    if ( linker_start == load_start )
> +        return;
> +
> +    setup_initial_mapping(&mmu_desc,
> +                          load_start,
> +                          load_start + PAGE_SIZE,
> +                          load_start);
> +
> +    setup_initial_mapping(&mmu_desc,
> +                          (unsigned long)cpu0_boot_stack,
> +                          (unsigned long)cpu0_boot_stack + PAGE_SIZE,

Shouldn't this be STACK_SIZE (and then also be prepared for
STACK_SIZE > PAGE_SIZE)?

> +                          (unsigned long)cpu0_boot_stack);
>  }
>  
> -void __init noreturn noinline enable_mmu()
> +/*
> + * enable_mmu() can't be __init because __init section isn't part of identity
> + * mapping so it will cause an issue after MMU will be enabled.
> + */

As hinted at above already - perhaps the identity mapping wants to be
larger, up to covering the entire Xen image? Since it's temporary
only anyway, you could even consider using a large page (and RWX
permission). You already require no overlap of link and load addresses,
so at least small page mappings ought to be possible for the entire
image.

> @@ -255,25 +270,41 @@ void __init noreturn noinline enable_mmu()
>      csr_write(CSR_SATP,
>                PFN_DOWN((unsigned long)stage1_pgtbl_root) |
>                RV_STAGE1_MODE << SATP_MODE_SHIFT);
> +}
> +
> +void __init remove_identity_mapping(void)
> +{
> +    int i, j;

Nit: unsigned int please.

> +    pte_t *pgtbl;
> +    unsigned int index, xen_index;

These would all probably better be declared in the narrowest possible
scope.

> -    asm volatile ( ".p2align 2" );
> - mmu_is_enabled:
>      /*
> -     * Stack should be re-inited as:
> -     * 1. Right now an address of the stack is relative to load time
> -     *    addresses what will cause an issue in case of load start address
> -     *    isn't equal to linker start address.
> -     * 2. Addresses in stack are all load time relative which can be an
> -     *    issue in case when load start address isn't equal to linker
> -     *    start address.
> -     *
> -     * We can't return to the caller because the stack was reseted
> -     * and it may have stash some variable on the stack.
> -     * Jump to a brand new function as the stack was reseted
> +     * id_addrs should be in sync with id mapping in
> +     * setup_initial_pagetables()

What is "id" meant to stand for here? Also if things need keeping in
sync, then a similar comment should exist on the other side.

>       */
> +    unsigned long id_addrs[] =  {
> +                                 LINK_TO_LOAD(_start),
> +                                 LINK_TO_LOAD(cpu0_boot_stack),
> +                                };
>  
> -    switch_stack_and_jump((unsigned long)cpu0_boot_stack + STACK_SIZE,
> -                          cont_after_mmu_is_enabled);
> +    pgtbl = stage1_pgtbl_root;
> +
> +    for ( j = 0; j < ARRAY_SIZE(id_addrs); j++ )
> +    {
> +        for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS - 1; i >= 0; i-- )
> +        {
> +            index = pt_index(i, id_addrs[j]);
> +            xen_index = pt_index(i, XEN_VIRT_START);
> +
> +            if ( index != xen_index )
> +            {
> +                pgtbl[index].pte = 0;
> +                break;
> +            }

Up to here I understand index specifies a particular PTE within pgtbl[].

> +            pgtbl = &pgtbl[index];

But then how can this be the continuation of the page table walk? Don't
you need to read the address out of the PTE?

> --- a/xen/arch/riscv/riscv64/head.S
> +++ b/xen/arch/riscv/riscv64/head.S
> @@ -31,6 +31,36 @@ ENTRY(start)
>  
>          jal     calc_phys_offset
>  
> +        jal     setup_initial_pagetables
> +
> +        jal     enable_mmu
> +
> +        /*
> +         * Calculate physical offset
> +         *
> +         * We can't re-use a value in phys_offset variable here as
> +         * phys_offset is located in .bss and this section isn't
> +         * 1:1 mapped and an access to it will cause MMU fault
> +         */
> +        li      t0, XEN_VIRT_START
> +        la      t1, start
> +        sub     t1, t1, t0

If I'm not mistaken this calculates start - XEN_VIRT_START, and ...

> +        /* Calculate proper VA after jump from 1:1 mapping */
> +        la      t0, .L_primary_switched
> +        sub     t0, t0, t1

... then this .L_primary_switched - (start - XEN_VIRT_START). Which can
also be expressed as (.L_primary_switched - start) + XEN_VIRT_START,
the first part of which gas ought to be able to resolve for you. But
upon experimenting a little it looks like it can't. (I had thought of
something along the lines of

        li      t0, .L_primary_switched - start
        li      t1, XEN_VIRT_START
        add     t0, t0, t1

or even

        li      t1, XEN_VIRT_START
        add     t0, t1, %pcrel_lo(.L_primary_switched - start)

.)

Jan
Jan Beulich June 12, 2023, 2:24 p.m. UTC | #2
On 12.06.2023 15:48, Jan Beulich wrote:
> On 06.06.2023 21:55, Oleksii Kurochko wrote:
>> -void __init noreturn noinline enable_mmu()
>> +/*
>> + * enable_mmu() can't be __init because __init section isn't part of identity
>> + * mapping so it will cause an issue after MMU will be enabled.
>> + */
> 
> As hinted at above already - perhaps the identity mapping wants to be
> larger, up to covering the entire Xen image? Since it's temporary
> only anyway, you could even consider using a large page (and RWX
> permission). You already require no overlap of link and load addresses,
> so at least small page mappings ought to be possible for the entire
> image.

To expand on that: Assume a future change on this path results in a call
to memcpy() or memset() being introduced by the compiler (and then let's
further assume this only occurs for a specific compiler version). Right
now such a case would be noticed simply because we don't build those
library functions yet. But it'll likely be a perplexing crash once a full
hypervisor can be built, the more that exception handlers also aren't
mapped.

>> - mmu_is_enabled:
>>      /*
>> -     * Stack should be re-inited as:
>> -     * 1. Right now an address of the stack is relative to load time
>> -     *    addresses what will cause an issue in case of load start address
>> -     *    isn't equal to linker start address.
>> -     * 2. Addresses in stack are all load time relative which can be an
>> -     *    issue in case when load start address isn't equal to linker
>> -     *    start address.
>> -     *
>> -     * We can't return to the caller because the stack was reseted
>> -     * and it may have stash some variable on the stack.
>> -     * Jump to a brand new function as the stack was reseted
>> +     * id_addrs should be in sync with id mapping in
>> +     * setup_initial_pagetables()
> 
> What is "id" meant to stand for here? Also if things need keeping in
> sync, then a similar comment should exist on the other side.

I guess it's meant to stand for "identity mapping", but the common use
of "id" makes we wonder if the variable wouldn't better be ident_addrs[].

Jan
Oleksii Kurochko June 14, 2023, 9:47 a.m. UTC | #3
On Mon, 2023-06-12 at 15:48 +0200, Jan Beulich wrote:
> On 06.06.2023 21:55, Oleksii Kurochko wrote:
> > The way the switch to virtual addresses was implemented in
> > commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
> > wasn't safe enough, so an identity mapping was introduced and
> > used.
> 
> I don't think this is sufficient as a description. You want to make
> clear what the "not safe enough" is, and you also want to go into
> more detail as to the solution chosen. I'm particularly puzzled that
> you map just two singular pages ...
I'll update the description.

I map two pages as that is likely to be enough to switch out of the 1:1
mapping world. My point is that we need the 1:1 mapping only for a few
instructions, which are located in start() [ in the .text.header section ]:

        li      t0, XEN_VIRT_START
        la      t1, start
        sub     t1, t1, t0

        /* Calculate proper VA after jump from 1:1 mapping */
        la      t0, .L_primary_switched
        sub     t0, t0, t1

        /* Jump from 1:1 mapping world */
        jr      t0

And cpu0_boot_stack needs to be mapped 1:1 so we don't fault when
executing the epilogue of enable_mmu(), as it accesses a value on the
stack:
    ffffffffc00001b0:       6422                    ld      s0,8(sp)
    ffffffffc00001b2:       0141                    addi    sp,sp,16
    ffffffffc00001b4:       8082                    ret

> 
> > @@ -35,8 +40,10 @@ static unsigned long phys_offset;
> >   *
> >   * It might be needed one more page table in case when Xen load address
> >   * isn't 2 MB aligned.
> > + *
> > + * 3 additional page tables are needed for identity mapping.
> >   */
> > -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> > +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1 + 3)
> 
> What is this 3 coming from? It feels like the value should (again)
> somehow depend on CONFIG_PAGING_LEVELS.
3 is too many in the current case. It should be 2 more.

The linker address is 0xFFFFFFFC00000000, so to map linker to load
addresses we need 3 page tables ( on the assumption that the size of
Xen won't be larger than 2 MB ) plus 1 more in case the Xen load
address isn't 2 MB aligned.

We need 2 more page tables because Xen is loaded at address 0x80200000
by OpenSBI, and since the load address isn't equal to the linker
address we need 2 additional pages; the L2 table was already allocated
when mapping linker to load addresses.

And it looks like 2 is enough for Sv48 and Sv57, as the L4, L3 and L2
page tables will already have been allocated while mapping linker to
load addresses.

It will only be too much for Sv32 ( which has only L1 and L0 in
general ), but I am not sure that anyone cares about Sv32.

> 
> > @@ -108,16 +116,18 @@ static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
> >              {
> >                  unsigned long paddr = (page_addr - map_start) + pa_start;
> >                  unsigned int permissions = PTE_LEAF_DEFAULT;
> > +                unsigned long addr = (is_identity_mapping) ?
> 
> Nit: No need for parentheses here.
Thanks.

> 
> > +                                     page_addr : LINK_TO_LOAD(page_addr);
> 
> As a remark, while we want binary operators at the end of lines when
> wrapping, we usually do things differently for the ternary operator:
> Either
> 
>                 unsigned long addr = is_identity_mapping
>                                      ? page_addr : LINK_TO_LOAD(page_addr);
> 
> or
> 
>                 unsigned long addr = is_identity_mapping
>                                      ? page_addr
>                                      : LINK_TO_LOAD(page_addr);
It looks better. Thanks.
> 
> .
> 
> > @@ -232,22 +242,27 @@ void __init setup_initial_pagetables(void)
> >                            linker_start,
> >                            linker_end,
> >                            load_start);
> > +
> > +    if ( linker_start == load_start )
> > +        return;
> > +
> > +    setup_initial_mapping(&mmu_desc,
> > +                          load_start,
> > +                          load_start + PAGE_SIZE,
> > +                          load_start);
> > +
> > +    setup_initial_mapping(&mmu_desc,
> > +                          (unsigned long)cpu0_boot_stack,
> > +                          (unsigned long)cpu0_boot_stack + PAGE_SIZE,
> 
> Shouldn't this be STACK_SIZE (and then also be prepared for
> STACK_SIZE > PAGE_SIZE)?
Yes, it would be better to use STACK_SIZE, but for the 1:1 mapping
PAGE_SIZE is enough since, as I mentioned above, this is only needed
for the epilogue of enable_mmu().
> 
> > +                          (unsigned long)cpu0_boot_stack);
> >  }
> >  
> > -void __init noreturn noinline enable_mmu()
> > +/*
> > + * enable_mmu() can't be __init because __init section isn't part of identity
> > + * mapping so it will cause an issue after MMU will be enabled.
> > + */
> 
> As hinted at above already - perhaps the identity mapping wants to be
> larger, up to covering the entire Xen image? Since it's temporary
> only anyway, you could even consider using a large page (and RWX
> permission). You already require no overlap of link and load
> addresses,
> so at least small page mappings ought to be possible for the entire
> image.
It makes sense; I started to think about that after I applied the
patch for Dom0 running... I think we can map a 1 GB page which will
cover the whole Xen image. Also in that case we don't have to allocate
2 more page tables.

> 
> > @@ -255,25 +270,41 @@ void __init noreturn noinline enable_mmu()
> >      csr_write(CSR_SATP,
> >                PFN_DOWN((unsigned long)stage1_pgtbl_root) |
> >                RV_STAGE1_MODE << SATP_MODE_SHIFT);
> > +}
> > +
> > +void __init remove_identity_mapping(void)
> > +{
> > +    int i, j;
> 
> Nit: unsigned int please.
Thanks.

> 
> > +    pte_t *pgtbl;
> > +    unsigned int index, xen_index;
> 
> These would all probably better be declared in the narrowest possible
> scope.
Sure.

> 
> > -    asm volatile ( ".p2align 2" );
> > - mmu_is_enabled:
> >      /*
> > -     * Stack should be re-inited as:
> > -     * 1. Right now an address of the stack is relative to load time
> > -     *    addresses what will cause an issue in case of load start address
> > -     *    isn't equal to linker start address.
> > -     * 2. Addresses in stack are all load time relative which can be an
> > -     *    issue in case when load start address isn't equal to linker
> > -     *    start address.
> > -     *
> > -     * We can't return to the caller because the stack was reseted
> > -     * and it may have stash some variable on the stack.
> > -     * Jump to a brand new function as the stack was reseted
> > +     * id_addrs should be in sync with id mapping in
> > +     * setup_initial_pagetables()
> 
> What is "id" meant to stand for here? Also if things need keeping in
> sync, then a similar comment should exist on the other side.
"id" here mean identity.

> 
> >       */
> > +    unsigned long id_addrs[] =  {
> > +                                 LINK_TO_LOAD(_start),
> > +                                 LINK_TO_LOAD(cpu0_boot_stack),
> > +                                };
> >  
> > -    switch_stack_and_jump((unsigned long)cpu0_boot_stack + STACK_SIZE,
> > -                          cont_after_mmu_is_enabled);
> > +    pgtbl = stage1_pgtbl_root;
> > +
> > +    for ( j = 0; j < ARRAY_SIZE(id_addrs); j++ )
> > +    {
> > +        for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS - 1; i >= 0; i-- )
> > +        {
> > +            index = pt_index(i, id_addrs[j]);
> > +            xen_index = pt_index(i, XEN_VIRT_START);
> > +
> > +            if ( index != xen_index )
> > +            {
> > +                pgtbl[index].pte = 0;
> > +                break;
> > +            }
> 
> Up to here I understand index specifies a particular PTE within
> pgtbl[].
> 
> > +            pgtbl = &pgtbl[index];
> 
> But then how can this be the continuation of the page table walk? Don't
> you need to read the address out of the PTE?
You are right. It should be:
    pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
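
With that fix folded in, the inner walk would read ( a sketch; whether
the physical address is directly dereferenceable at this point depends
on the mappings still in place ):

    for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS - 1; i >= 0; i-- )
    {
        index = pt_index(i, id_addrs[j]);
        xen_index = pt_index(i, XEN_VIRT_START);

        if ( index != xen_index )
        {
            /* The identity subtree hangs off this entry alone: zap it. */
            pgtbl[index].pte = 0;
            break;
        }

        /* Descend: the PTE holds the address of the next-level table. */
        pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
    }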

> 
> > --- a/xen/arch/riscv/riscv64/head.S
> > +++ b/xen/arch/riscv/riscv64/head.S
> > @@ -31,6 +31,36 @@ ENTRY(start)
> >  
> >          jal     calc_phys_offset
> >  
> > +        jal     setup_initial_pagetables
> > +
> > +        jal     enable_mmu
> > +
> > +        /*
> > +         * Calculate physical offset
> > +         *
> > +         * We can't re-use a value in phys_offset variable here as
> > +         * phys_offset is located in .bss and this section isn't
> > +         * 1:1 mapped and an access to it will cause MMU fault
> > +         */
> > +        li      t0, XEN_VIRT_START
> > +        la      t1, start
> > +        sub     t1, t1, t0
> 
> If I'm not mistaken this calculates start - XEN_VIRT_START, and ...
> 
> > +        /* Calculate proper VA after jump from 1:1 mapping */
> > +        la      t0, .L_primary_switched
> > +        sub     t0, t0, t1
> 
> ... then this .L_primary_switched - (start - XEN_VIRT_START). Which can
> also be expressed as (.L_primary_switched - start) + XEN_VIRT_START,
> the first part of which gas ought to be able to resolve for you. But
> upon experimenting a little it looks like it can't. (I had thought of
> something along the lines of
> 
>         li      t0, .L_primary_switched - start
>         li      t1, XEN_VIRT_START
>         add     t0, t0, t1
> 
> or even
> 
>         li      t1, XEN_VIRT_START
>         add     t0, t1, %pcrel_lo(.L_primary_switched - start)
> 
> .)
Calculating ".L_primary_switched - start" might be an issue for gas. I
tried to do something similar and received either "Error: can't resolve
.L_primary_switched - start" or "Error: illegal operands `li
t0,.L_primary_switched-start'".

~ Oleksii
Oleksii Kurochko June 14, 2023, 9:53 a.m. UTC | #4
On Mon, 2023-06-12 at 16:24 +0200, Jan Beulich wrote:
> On 12.06.2023 15:48, Jan Beulich wrote:
> > On 06.06.2023 21:55, Oleksii Kurochko wrote:
> > > -void __init noreturn noinline enable_mmu()
> > > +/*
> > > + * enable_mmu() can't be __init because __init section isn't part of identity
> > > + * mapping so it will cause an issue after MMU will be enabled.
> > > + */
> > 
> > As hinted at above already - perhaps the identity mapping wants to be
> > larger, up to covering the entire Xen image? Since it's temporary
> > only anyway, you could even consider using a large page (and RWX
> > permission). You already require no overlap of link and load addresses,
> > so at least small page mappings ought to be possible for the entire
> > image.
> 
> To expand on that: Assume a future change on this path results in a call
> to memcpy() or memset() being introduced by the compiler (and then let's
> further assume this only occurs for a specific compiler version). Right
> now such a case would be noticed simply because we don't build those
> library functions yet. But it'll likely be a perplexing crash once a full
> hypervisor can be built, the more that exception handlers also aren't
> mapped.
It makes sense, but for some reason it doesn't crash ( I have a bunch
of patches to run dom0 ). As I mentioned in the previous e-mail, I
agree that it would probably be better to map the whole image, using a
1 GB page for example.


> 
> > > - mmu_is_enabled:
> > >      /*
> > > -     * Stack should be re-inited as:
> > > -     * 1. Right now an address of the stack is relative to load time
> > > -     *    addresses what will cause an issue in case of load start address
> > > -     *    isn't equal to linker start address.
> > > -     * 2. Addresses in stack are all load time relative which can be an
> > > -     *    issue in case when load start address isn't equal to linker
> > > -     *    start address.
> > > -     *
> > > -     * We can't return to the caller because the stack was reseted
> > > -     * and it may have stash some variable on the stack.
> > > -     * Jump to a brand new function as the stack was reseted
> > > +     * id_addrs should be in sync with id mapping in
> > > +     * setup_initial_pagetables()
> > 
> > What is "id" meant to stand for here? Also if things need keeping in
> > sync, then a similar comment should exist on the other side.
> 
> I guess it's meant to stand for "identity mapping", but the common use
> of "id" makes me wonder if the variable wouldn't better be ident_addrs[].
It would be better, but we will probably remove that variable anyway if
we agree to map the whole of Xen instead of parts of it. So I'll wait
for your response before starting to work on the new patch series.

Thanks a lot.

~ Oleksii
Jan Beulich June 14, 2023, 10:04 a.m. UTC | #5
On 14.06.2023 11:47, Oleksii wrote:
> On Mon, 2023-06-12 at 15:48 +0200, Jan Beulich wrote:
>> On 06.06.2023 21:55, Oleksii Kurochko wrote:
>>> The way the switch to virtual addresses was implemented in
>>> commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
>>> wasn't safe enough, so an identity mapping was introduced and
>>> used.
>>
>> I don't think this is sufficient as a description. You want to make
>> clear what the "not safe enough" is, and you also want to go into
>> more detail as to the solution chosen. I'm particularly puzzled that
>> you map just two singular pages ...
> I'll update the description.
> 
> I map two pages as that is likely to be enough to switch out of the 1:1

I dislike your use of "likely" here: Unless this code is meant to be
redone anyway, it should just work. Everywhere.

> mapping world. My point is that we need the 1:1 mapping only for a few
> instructions, which are located in start() [ in the .text.header section ]:
> 
>         li      t0, XEN_VIRT_START
>         la      t1, start
>         sub     t1, t1, t0
> 
>         /* Calculate proper VA after jump from 1:1 mapping */
>         la      t0, .L_primary_switched
>         sub     t0, t0, t1
> 
>         /* Jump from 1:1 mapping world */
>         jr      t0
> 
> And cpu0_boot_stack needs to be mapped 1:1 so we don't fault when
> executing the epilogue of enable_mmu(), as it accesses a value on the
> stack:
>     ffffffffc00001b0:       6422                    ld      s0,8(sp)
>     ffffffffc00001b2:       0141                    addi    sp,sp,16
>     ffffffffc00001b4:       8082                    ret
> 
>>
>>> @@ -35,8 +40,10 @@ static unsigned long phys_offset;
>>>   *
>>>   * It might be needed one more page table in case when Xen load address
>>>   * isn't 2 MB aligned.
>>> + *
>>> + * 3 additional page tables are needed for identity mapping.
>>>   */
>>> -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
>>> +#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1 + 3)
>>
>> What is this 3 coming from? It feels like the value should (again)
>> somehow depend on CONFIG_PAGING_LEVELS.
> 3 is too many in the current case. It should be 2 more.
> 
> The linker address is 0xFFFFFFFC00000000, so to map linker to load
> addresses we need 3 page tables ( on the assumption that the size of
> Xen won't be larger than 2 MB ) plus 1 more in case the Xen load
> address isn't 2 MB aligned.
> 
> We need 2 more page tables because Xen is loaded at address 0x80200000
> by OpenSBI, and since the load address isn't equal to the linker
> address we need 2 additional pages; the L2 table was already allocated
> when mapping linker to load addresses.
> 
> And it looks like 2 is enough for Sv48 and Sv57, as the L4, L3 and L2
> page tables will already have been allocated while mapping linker to
> load addresses.

I agree the root table will be there. But depending on load address, I
don't think you can rely on any other tables to be re-usable from the
Xen mappings you already have. So my gut feeling would be

#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)
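
Worked through ( illustrative arithmetic ):

    /*
     * PGTBL_INITIAL_COUNT = (CONFIG_PAGING_LEVELS - 1) * 2 + 1
     *
     * Sv39 (3 levels): (3 - 1) * 2 + 1 = 5
     * Sv48 (4 levels): (4 - 1) * 2 + 1 = 7
     * Sv57 (5 levels): (5 - 1) * 2 + 1 = 9
     *
     * I.e. one shared root table plus, in the worst case, a separate
     * table at every non-root level for each of the two walks (the
     * link-address mapping and the identity mapping).
     */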

>>> @@ -232,22 +242,27 @@ void __init setup_initial_pagetables(void)
>>>                            linker_start,
>>>                            linker_end,
>>>                            load_start);
>>> +
>>> +    if ( linker_start == load_start )
>>> +        return;
>>> +
>>> +    setup_initial_mapping(&mmu_desc,
>>> +                          load_start,
>>> +                          load_start + PAGE_SIZE,
>>> +                          load_start);
>>> +
>>> +    setup_initial_mapping(&mmu_desc,
>>> +                          (unsigned long)cpu0_boot_stack,
>>> +                          (unsigned long)cpu0_boot_stack + PAGE_SIZE,
>>
>> Shouldn't this be STACK_SIZE (and then also be prepared for
>> STACK_SIZE > PAGE_SIZE)?
> Yes, it would be better to use STACK_SIZE, but for the 1:1 mapping
> PAGE_SIZE is enough since, as I mentioned above, this is only needed
> for the epilogue of enable_mmu().

But then it should still be the correct page of the stack that you
map (the top one aiui).
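
I.e., if only a single page is to be mapped, something like ( sketch ):

    unsigned long stack_top = (unsigned long)cpu0_boot_stack + STACK_SIZE;

    /* Map only the page the enable_mmu() epilogue's sp points into. */
    setup_initial_mapping(&mmu_desc,
                          stack_top - PAGE_SIZE,
                          stack_top,
                          stack_top - PAGE_SIZE);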

>>> -void __init noreturn noinline enable_mmu()
>>> +/*
>>> + * enable_mmu() can't be __init because __init section isn't part of identity
>>> + * mapping so it will cause an issue after MMU will be enabled.
>>> + */
>>
>> As hinted at above already - perhaps the identity mapping wants to be
>> larger, up to covering the entire Xen image? Since it's temporary
>> only anyway, you could even consider using a large page (and RWX
>> permission). You already require no overlap of link and load addresses,
>> so at least small page mappings ought to be possible for the entire
>> image.
> It makes sense; I started to think about that after I applied the
> patch for Dom0 running... I think we can map a 1 GB page which will
> cover the whole Xen image. Also in that case we don't have to allocate
> 2 more page tables.

But you then need to be careful about not mapping space accesses to
which may have side effects (i.e. non-RAM, which might be MMIO or
holes). Otherwise speculative execution might cause surprises,
unless such is excluded by other guarantees (of which I don't know).

>>> --- a/xen/arch/riscv/riscv64/head.S
>>> +++ b/xen/arch/riscv/riscv64/head.S
>>> @@ -31,6 +31,36 @@ ENTRY(start)
>>>  
>>>          jal     calc_phys_offset
>>>  
>>> +        jal     setup_initial_pagetables
>>> +
>>> +        jal     enable_mmu
>>> +
>>> +        /*
>>> +         * Calculate physical offset
>>> +         *
>>> +         * We can't re-use a value in phys_offset variable here as
>>> +         * phys_offset is located in .bss and this section isn't
>>> +         * 1:1 mapped and an access to it will cause MMU fault
>>> +         */
>>> +        li      t0, XEN_VIRT_START
>>> +        la      t1, start
>>> +        sub     t1, t1, t0
>>
>> If I'm not mistaken this calculates start - XEN_VIRT_START, and ...
>>
>>> +        /* Calculate proper VA after jump from 1:1 mapping */
>>> +        la      t0, .L_primary_switched
>>> +        sub     t0, t0, t1
>>
>> ... then this .L_primary_switched - (start - XEN_VIRT_START). Which can
>> also be expressed as (.L_primary_switched - start) + XEN_VIRT_START,
>> the first part of which gas ought to be able to resolve for you. But
>> upon experimenting a little it looks like it can't. (I had thought of
>> something along the lines of
>>
>>         li      t0, .L_primary_switched - start
>>         li      t1, XEN_VIRT_START
>>         add     t0, t0, t1
>>
>> or even
>>
>>         li      t1, XEN_VIRT_START
>>         add     t0, t1, %pcrel_lo(.L_primary_switched - start)
>>
>> .)
> Calculating ".L_primary_switched - start" might be an issue for gas. I
> tried to do something similar and received either "Error: can't
> resolve .L_primary_switched - start" or "Error: illegal operands `li
> t0,.L_primary_switched-start'".

Matches the results of my experiments. If I can find time, I may want
to look into why exactly gas is rejecting such.

Jan
Oleksii Kurochko June 14, 2023, 11:06 a.m. UTC | #6
> > +}
> > +
> > +void __init remove_identity_mapping(void)
> > +{
> > +    int i, j;
> 
> Nit: unsigned int please.
> 
> 
It should be int in the current case because of the 'for' exit
condition:
    for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS - 1; i >= 0; i-- )

Should the exit condition be rewritten?

~ Oleksii
Jan Beulich June 14, 2023, 11:38 a.m. UTC | #7
On 14.06.2023 13:06, Oleksii wrote:
>>> +}
>>> +
>>> +void __init remove_identity_mapping(void)
>>> +{
>>> +    int i, j;
>>
>> Nit: unsigned int please.
>>
>>
> It should be int in the current case because of the 'for' exit
> condition:
>     for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS - 1; i >= 0; i-- )
> 
> Should the exit condition be rewritten?

Since it easily can be, I think that would be preferable. But in a case
like this, if you think it's better the way you have it, so be it (until
someone comes along and changes it).

Jan
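
One possible unsigned rewrite of that loop ( a sketch; any equivalent
form would do ):

    unsigned int i;

    for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS; i-- > 0; )
    {
        /* i runs CONFIG_PAGING_LEVELS - 1 down to 0, never negative. */
        unsigned int index = pt_index(i, id_addrs[j]);
        /* ... rest of the walk unchanged ... */
    }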

Patch

diff --git a/xen/arch/riscv/include/asm/mm.h b/xen/arch/riscv/include/asm/mm.h
index 996041ce81..500fdc9c5a 100644
--- a/xen/arch/riscv/include/asm/mm.h
+++ b/xen/arch/riscv/include/asm/mm.h
@@ -9,7 +9,8 @@ 
 void setup_initial_pagetables(void);
 
 void enable_mmu(void);
-void cont_after_mmu_is_enabled(void);
+
+void remove_identity_mapping(void);
 
 void calc_phys_offset(void);
 
diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
index c092897f9a..ab790f571d 100644
--- a/xen/arch/riscv/mm.c
+++ b/xen/arch/riscv/mm.c
@@ -24,6 +24,11 @@  static unsigned long phys_offset;
 #define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
 #define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
 
+/*
+ * Should be removed as soon as enough headers will be merged for inclusion of
+ * <xen/lib.h>.
+ */
+#define ARRAY_SIZE(arr)		(sizeof(arr) / sizeof((arr)[0]))
 
 /*
  * It is expected that Xen won't be more then 2 MB.
@@ -35,8 +40,10 @@  static unsigned long phys_offset;
  *
  * It might be needed one more page table in case when Xen load address
  * isn't 2 MB aligned.
+ *
+ * 3 additional page tables are needed for identity mapping.
  */
-#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
+#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1 + 3)
 
 pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
 stage1_pgtbl_root[PAGETABLE_ENTRIES];
@@ -75,6 +82,7 @@  static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
     unsigned int index;
     pte_t *pgtbl;
     unsigned long page_addr;
+    bool is_identity_mapping = (map_start == pa_start);
 
     if ( (unsigned long)_start % XEN_PT_LEVEL_SIZE(0) )
     {
@@ -108,16 +116,18 @@  static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
             {
                 unsigned long paddr = (page_addr - map_start) + pa_start;
                 unsigned int permissions = PTE_LEAF_DEFAULT;
+                unsigned long addr = (is_identity_mapping) ?
+                                     page_addr : LINK_TO_LOAD(page_addr);
                 pte_t pte_to_be_written;
 
                 index = pt_index(0, page_addr);
 
-                if ( is_kernel_text(LINK_TO_LOAD(page_addr)) ||
-                     is_kernel_inittext(LINK_TO_LOAD(page_addr)) )
-                    permissions =
-                        PTE_EXECUTABLE | PTE_READABLE | PTE_VALID;
+                if ( is_kernel_text(addr) ||
+                     is_kernel_inittext(addr) )
+                        permissions =
+                            PTE_EXECUTABLE | PTE_READABLE | PTE_VALID;
 
-                if ( is_kernel_rodata(LINK_TO_LOAD(page_addr)) )
+                if ( is_kernel_rodata(addr) )
                     permissions = PTE_READABLE | PTE_VALID;
 
                 pte_to_be_written = paddr_to_pte(paddr, permissions);
@@ -232,22 +242,27 @@  void __init setup_initial_pagetables(void)
                           linker_start,
                           linker_end,
                           load_start);
+
+    if ( linker_start == load_start )
+        return;
+
+    setup_initial_mapping(&mmu_desc,
+                          load_start,
+                          load_start + PAGE_SIZE,
+                          load_start);
+
+    setup_initial_mapping(&mmu_desc,
+                          (unsigned long)cpu0_boot_stack,
+                          (unsigned long)cpu0_boot_stack + PAGE_SIZE,
+                          (unsigned long)cpu0_boot_stack);
 }
 
-void __init noreturn noinline enable_mmu()
+/*
+ * enable_mmu() can't be __init because __init section isn't part of identity
+ * mapping so it will cause an issue after MMU will be enabled.
+ */
+void enable_mmu(void)
 {
-    /*
-     * Calculate a linker time address of the mmu_is_enabled
-     * label and update CSR_STVEC with it.
-     * MMU is configured in a way where linker addresses are mapped
-     * on load addresses so in a case when linker addresses are not equal
-     * to load addresses, after MMU is enabled, it will cause
-     * an exception and jump to linker time addresses.
-     * Otherwise if load addresses are equal to linker addresses the code
-     * after mmu_is_enabled label will be executed without exception.
-     */
-    csr_write(CSR_STVEC, LOAD_TO_LINK((unsigned long)&&mmu_is_enabled));
-
     /* Ensure page table writes precede loading the SATP */
     sfence_vma();
 
@@ -255,25 +270,41 @@  void __init noreturn noinline enable_mmu()
     csr_write(CSR_SATP,
               PFN_DOWN((unsigned long)stage1_pgtbl_root) |
               RV_STAGE1_MODE << SATP_MODE_SHIFT);
+}
+
+void __init remove_identity_mapping(void)
+{
+    int i, j;
+    pte_t *pgtbl;
+    unsigned int index, xen_index;
 
-    asm volatile ( ".p2align 2" );
- mmu_is_enabled:
     /*
-     * Stack should be re-inited as:
-     * 1. Right now an address of the stack is relative to load time
-     *    addresses what will cause an issue in case of load start address
-     *    isn't equal to linker start address.
-     * 2. Addresses in stack are all load time relative which can be an
-     *    issue in case when load start address isn't equal to linker
-     *    start address.
-     *
-     * We can't return to the caller because the stack was reseted
-     * and it may have stash some variable on the stack.
-     * Jump to a brand new function as the stack was reseted
+     * id_addrs should be in sync with id mapping in
+     * setup_initial_pagetables()
      */
+    unsigned long id_addrs[] =  {
+                                 LINK_TO_LOAD(_start),
+                                 LINK_TO_LOAD(cpu0_boot_stack),
+                                };
 
-    switch_stack_and_jump((unsigned long)cpu0_boot_stack + STACK_SIZE,
-                          cont_after_mmu_is_enabled);
+    pgtbl = stage1_pgtbl_root;
+
+    for ( j = 0; j < ARRAY_SIZE(id_addrs); j++ )
+    {
+        for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS - 1; i >= 0; i-- )
+        {
+            index = pt_index(i, id_addrs[j]);
+            xen_index = pt_index(i, XEN_VIRT_START);
+
+            if ( index != xen_index )
+            {
+                pgtbl[index].pte = 0;
+                break;
+            }
+
+            pgtbl = &pgtbl[index];
+        }
+    }
 }
 
 /*
diff --git a/xen/arch/riscv/riscv64/head.S b/xen/arch/riscv/riscv64/head.S
index 69f3a24987..582078798a 100644
--- a/xen/arch/riscv/riscv64/head.S
+++ b/xen/arch/riscv/riscv64/head.S
@@ -31,6 +31,36 @@  ENTRY(start)
 
         jal     calc_phys_offset
 
+        jal     setup_initial_pagetables
+
+        jal     enable_mmu
+
+        /*
+         * Calculate physical offset
+         *
+         * We can't re-use a value in phys_offset variable here as
+         * phys_offset is located in .bss and this section isn't
+         * 1:1 mapped and an access to it will cause MMU fault
+         */
+        li      t0, XEN_VIRT_START
+        la      t1, start
+        sub     t1, t1, t0
+
+        /* Calculate proper VA after jump from 1:1 mapping */
+        la      t0, .L_primary_switched
+        sub     t0, t0, t1
+
+        /* Jump from 1:1 mapping world */
+        jr      t0
+
+.L_primary_switched:
+        /*
+         * cpu0_boot_stack address is 1:1 mapping related so it should be
+         * recalculated after jump from 1:1 mapping world as 1:1 mapping
+         * will be removed soon in start_xen().
+         */
+        jal     reset_stack
+
         tail    start_xen
 
 ENTRY(reset_stack)
diff --git a/xen/arch/riscv/setup.c b/xen/arch/riscv/setup.c
index 845d18d86f..c4ef0b3165 100644
--- a/xen/arch/riscv/setup.c
+++ b/xen/arch/riscv/setup.c
@@ -11,20 +11,10 @@  unsigned char __initdata cpu0_boot_stack[STACK_SIZE]
 void __init noreturn start_xen(unsigned long bootcpu_id,
                                paddr_t dtb_addr)
 {
-    early_printk("Hello from C env\n");
-
-    setup_initial_pagetables();
-
-    enable_mmu();
-
-    for ( ;; )
-        asm volatile ("wfi");
+    remove_identity_mapping();
 
-    unreachable();
-}
+    early_printk("Hello from C env\n");
 
-void __init noreturn cont_after_mmu_is_enabled(void)
-{
     early_printk("All set up\n");
 
     for ( ;; )