
[3/3] x86/PVH: Support relocatable dom0 kernels

Message ID 20240306185032.103216-4-jason.andryuk@amd.com
State Superseded
Series x86/pvh: Support relocating dom0 kernel

Commit Message

Jason Andryuk March 6, 2024, 6:50 p.m. UTC
Xen tries to load a PVH dom0 kernel at the fixed guest physical address
from the elf headers.  For Linux, this defaults to 0x1000000 (16MB), but
it can be configured.

Unfortunately there exist firmwares that have reserved regions at this
address, so Xen fails to load the dom0 kernel since it's not RAM.

The PVH entry code is not relocatable - it loads from absolute
addresses, which fail when the kernel is loaded at a different address.
With a suitably modified kernel, a relocatable entry point is possible.

Add the XENFEAT_pvh_relocatable flag to let a kernel indicate that it
supports a relocatable entry path.

Change the loading to check for an acceptable load address.  If the
kernel is relocatable, support finding an alternate load address.

Linux cares about its physical alignment.  This can be pulled out of the
bzImage header, but not from the vmlinux ELF file.  If an alignment
can't be found, use 2MB.

Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
---
Put alignment as a new ELF note?  Presence of that note would indicate
relocation support without a new XENFEAT flag.

Default alignment to a multiple of 2MB to make more cases work?  It has
to be a power of two, and a multiple might allow loading a customized
kernel.  A larger alignment would limit the number of possible load
locations.
---
 xen/arch/x86/hvm/dom0_build.c | 109 ++++++++++++++++++++++++++++++++++
 xen/include/public/features.h |   5 ++
 2 files changed, 114 insertions(+)

Comments

Stefano Stabellini March 7, 2024, 2:09 a.m. UTC | #1
On Wed, 6 Mar 2024, Jason Andryuk wrote:
> Xen tries to load a PVH dom0 kernel at the fixed guest physical address
> from the elf headers.  For Linux, this defaults to 0x1000000 (16MB), but
> it can be configured.
> 
> Unfortunately there exist firmwares that have reserved regions at this
> address, so Xen fails to load the dom0 kernel since it's not RAM.
> 
> The PVH entry code is not relocatable - it loads from absolute
> addresses, which fail when the kernel is loaded at a different address.
> With a suitably modified kernel, a relocatable entry point is possible.
> 
> Add the XENFEAT_pvh_relocatable flag to let a kernel indicate that it
> supports a relocatable entry path.
> 
> Change the loading to check for an acceptable load address.  If the
> kernel is relocatable, support finding an alternate load address.
> 
> Linux cares about its physical alignment.  This can be pulled out of the
> bzImage header, but not from the vmlinux ELF file.  If an alignment
> can't be found, use 2MB.
> 
> Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
> ---
> Put alignment as a new ELF note?  Presence of that note would indicate
> relocation support without a new XENFEAT flag.
> 
> Default alignment to a multiple of 2MB to make more cases work?  It has
> to be a power of two, and a multiple might allow loading a customized
> kernel.  A larger alignment would limit the number of possible load
> locations.
> ---
>  xen/arch/x86/hvm/dom0_build.c | 109 ++++++++++++++++++++++++++++++++++
>  xen/include/public/features.h |   5 ++
>  2 files changed, 114 insertions(+)
> 
> diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
> index bbae8a5645..34d68ee8fb 100644
> --- a/xen/arch/x86/hvm/dom0_build.c
> +++ b/xen/arch/x86/hvm/dom0_build.c
> @@ -537,6 +537,109 @@ static paddr_t __init find_memory(
>      return INVALID_PADDR;
>  }
>  
> +static bool __init check_load_address(
> +    const struct domain *d, const struct elf_binary *elf)
> +{
> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
> +                                 PAGE_SIZE);
> +    unsigned int i;
> +
> +    /*
> +     * The memory map is sorted and all RAM regions starts and sizes are
> +     * aligned to page boundaries.
> +     */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        paddr_t start = d->arch.e820[i].addr;
> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
> +
> +        if ( start <= kernel_start &&
> +             end >= kernel_end &&
> +             d->arch.e820[i].type == E820_RAM )
> +            return true;
> +    }
> +
> +    return false;
> +}
> +
> +/*
> + * Find an e820 RAM region that fits the kernel at a suitable alignment.
> + */
> +static paddr_t find_kernel_memory(
> +    const struct domain *d, struct elf_binary *elf, paddr_t align)
> +{
> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
> +                                 PAGE_SIZE);
> +    unsigned int i;
> +
> +    /*
> +     * The memory map is sorted and all RAM regions starts and sizes are
> +     * aligned to page boundaries.
> +     */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        paddr_t start = d->arch.e820[i].addr;
> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
> +        paddr_t kstart, kend, offset;
> +
> +        if ( d->arch.e820[i].type != E820_RAM )
> +            continue;
> +
> +        if ( d->arch.e820[i].size < elf->dest_size )
> +            continue;
> +
> +        if ( end < kernel_end )
> +            continue;

Why this check? Is it to make sure we look for e820 regions that are
higher in terms of addresses? If so, couldn't we start from
d->arch.nr_e820 and go down instead of starting from 0 and going up?

The PVH entry point is a 32-bit entry point if I remember right? Do we
need a 32-bit check? If so then it might not be a good idea to start
from arch.nr_e820 and go down.



> +        kstart = ROUNDUP(start, align);
> +        offset = kstart - kernel_start;
> +        kend = kernel_end + offset;
> +
> +        if ( kend <= end )
> +            return offset;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Check the kernel load address, and adjust if necessary and possible.
> + */
> +static bool __init adjust_load_address(
> +    const struct domain *d, struct elf_binary *elf, struct elf_dom_parms *parms,
> +    paddr_t align)
> +{
> +    paddr_t offset;
> +
> +    /* Check load address */
> +    if ( check_load_address(d, elf) )
> +        return true;
> +
> +    if ( !test_bit(XENFEAT_pvh_relocatable, parms->f_supported) )
> +    {
> +        printk("Address conflict and %pd kernel is not relocatable\n", d);
> +        return false;
> +    }
> +
> +    if ( align == 0 )
> +        align = MB(2);
> +
> +    offset = find_kernel_memory(d, elf, align);
> +    if ( offset == 0 )
> +    {
> +        printk("Failed to find a load offset for the kernel\n");
> +        return false;
> +    }
> +
> +    printk("Adjusting load address by %#lx\n", offset);
> +    elf->dest_base += offset;
> +    parms->phys_entry += offset;
> +
> +    return true;
> +}
> +
>  static int __init pvh_load_kernel(struct domain *d, const module_t *image,
>                                    unsigned long image_headroom,
>                                    module_t *initrd, void *image_base,
> @@ -587,6 +690,12 @@ static int __init pvh_load_kernel(struct domain *d, const module_t *image,
>      elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base);
>      elf.dest_size = parms.virt_kend - parms.virt_kstart;
>  
> +    if ( !adjust_load_address(d, &elf, &parms, align) )
> +    {
> +        printk("Unable to load kernel\n");
> +        return -ENOMEM;
> +    }
> +
>      elf_set_vcpu(&elf, v);
>      rc = elf_load_binary(&elf);
>      if ( rc < 0 )
> diff --git a/xen/include/public/features.h b/xen/include/public/features.h
> index 4437f25d25..300480cb22 100644
> --- a/xen/include/public/features.h
> +++ b/xen/include/public/features.h
> @@ -120,6 +120,11 @@
>  #define XENFEAT_runstate_phys_area        18
>  #define XENFEAT_vcpu_time_phys_area       19
>  
> +/*
> + * PVH: If set, the guest supports relocation in load address.
> + */
> +#define XENFEAT_pvh_relocatable           20
> +
>  #define XENFEAT_NR_SUBMAPS 1
>  
>  #endif /* __XEN_PUBLIC_FEATURES_H__ */
> -- 
> 2.44.0
> 
>
Roger Pau Monné March 7, 2024, 9:30 a.m. UTC | #2
On Wed, Mar 06, 2024 at 01:50:32PM -0500, Jason Andryuk wrote:
> Xen tries to load a PVH dom0 kernel at the fixed guest physical address
> from the elf headers.  For Linux, this defaults to 0x1000000 (16MB), but
> it can be configured.
> 
> Unfortunately there exist firmwares that have reserved regions at this
> address, so Xen fails to load the dom0 kernel since it's not RAM.
> 
> The PVH entry code is not relocatable - it loads from absolute
> addresses, which fail when the kernel is loaded at a different address.
> With a suitably modified kernel, a relocatable entry point is possible.
> 
> Add the XENFEAT_pvh_relocatable flag to let a kernel indicate that it
> supports a relocatable entry path.
> 
> Change the loading to check for an acceptable load address.  If the
> kernel is relocatable, support finding an alternate load address.
> 
> Linux cares about its physical alignment.  This can be pulled out of the
> bzImage header, but not from the vmlinux ELF file.  If an alignment
> can't be found, use 2MB.

While I'm fine with having a Linux specific way, there needs to be a
generic way of passing the alignment for non-bzImage kernels.

ELF program headers have an align field, would that be suitable to
use?

In elf_parse_binary() where the p{start,end} is calculated, you could
also fetch the p_offset from the lower found program header and use it
as the required alignment.  Would that be OK for Linux and maybe avoid
having to fiddle with the bzImage header?  FWIW it is likely fine for
FreeBSD.
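
As a rough illustration of that idea, a helper walking the program headers
could look like the sketch below.  The function name is made up, and the
libelf accessors (elf_phdr_count(), elf_phdr_by_index(), elf_uval(),
elf_phdr_is_loadable()) are used the way elf_parse_binary() appears to use
them -- treat this as a sketch, not a tested patch:

static uint64_t find_phdr_align(struct elf_binary *elf)
{
    uint64_t align = 0;
    unsigned int i, count = elf_phdr_count(elf);

    for ( i = 0; i < count; i++ )
    {
        ELF_HANDLE_DECL(elf_phdr) phdr = elf_phdr_by_index(elf, i);

        if ( !elf_phdr_is_loadable(elf, phdr) )
            continue;

        /* Take the strictest alignment of any loadable segment. */
        align = max(align, elf_uval(elf, phdr, p_align));
    }

    return align;
}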

> 
> Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>

I created a gitlab ticket for this:

https://gitlab.com/xen-project/xen/-/issues/180

If you want to reference it.

> ---
> Put alignment as a new ELF note?  Presence of that note would indicate
> relocation support without a new XENFEAT flag.
> 
> Default alignment to a multiple of 2MB to make more cases work?  It has
> to be a power of two, and a multiple might allow loading a customized
> kernel.  A larger alignment would limit the number of possible load
> locations.
> ---
>  xen/arch/x86/hvm/dom0_build.c | 109 ++++++++++++++++++++++++++++++++++
>  xen/include/public/features.h |   5 ++
>  2 files changed, 114 insertions(+)
> 
> diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
> index bbae8a5645..34d68ee8fb 100644
> --- a/xen/arch/x86/hvm/dom0_build.c
> +++ b/xen/arch/x86/hvm/dom0_build.c
> @@ -537,6 +537,109 @@ static paddr_t __init find_memory(
>      return INVALID_PADDR;
>  }
>  
> +static bool __init check_load_address(
> +    const struct domain *d, const struct elf_binary *elf)
> +{
> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
> +                                 PAGE_SIZE);

You can use PAGE_ALIGN() here (and below) for simplicity.

> +    unsigned int i;
> +
> +    /*
> +     * The memory map is sorted and all RAM regions starts and sizes are
> +     * aligned to page boundaries.
> +     */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        paddr_t start = d->arch.e820[i].addr;
> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;

Since the memory map is sorted you can end the loop once a region's start
is >= kernel_end?  Any further regions are past the kernel destination.
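
In code the early exit would just be (a sketch reusing the loop's locals):

        /* Sorted map: every remaining region is past the kernel. */
        if ( start >= kernel_end )
            break;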

> +
> +        if ( start <= kernel_start &&
> +             end >= kernel_end &&
> +             d->arch.e820[i].type == E820_RAM )
> +            return true;
> +    }
> +
> +    return false;
> +}
> +
> +/*
> + * Find an e820 RAM region that fits the kernel at a suitable alignment.
> + */

This (and other) comment seems to fit in a single line: /* ... */.

> +static paddr_t find_kernel_memory(
> +    const struct domain *d, struct elf_binary *elf, paddr_t align)

elf can be const AFAICT.

> +{
> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
> +                                 PAGE_SIZE);
> +    unsigned int i;
> +
> +    /*
> +     * The memory map is sorted and all RAM regions starts and sizes are
> +     * aligned to page boundaries.
> +     */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        paddr_t start = d->arch.e820[i].addr;
> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
> +        paddr_t kstart, kend, offset;
> +
> +        if ( d->arch.e820[i].type != E820_RAM )
> +            continue;
> +
> +        if ( d->arch.e820[i].size < elf->dest_size )
> +            continue;
> +
> +        if ( end < kernel_end )
> +            continue;

I'm not sure I understand this check, why would we refuse regions
below the fixed kernel end?  Those should be equally suitable if big
enough and meeting the alignment requirements.

> +
> +        kstart = ROUNDUP(start, align);
> +        offset = kstart - kernel_start;
> +        kend = kernel_end + offset;
> +
> +        if ( kend <= end )
> +            return offset;

Why not return this as an address to use to load the kernel instead of
an offset from dest_base?  That would also make the calculations
easier IMO.

> +    }

This should be limited to a range below 4GB.

> +    return 0;
> +}
> +
> +/*
> + * Check the kernel load address, and adjust if necessary and possible.
> + */
> +static bool __init adjust_load_address(
> +    const struct domain *d, struct elf_binary *elf, struct elf_dom_parms *parms,
> +    paddr_t align)
> +{
> +    paddr_t offset;
> +
> +    /* Check load address */
> +    if ( check_load_address(d, elf) )
> +        return true;
> +
> +    if ( !test_bit(XENFEAT_pvh_relocatable, parms->f_supported) )
> +    {
> +        printk("Address conflict and %pd kernel is not relocatable\n", d);
> +        return false;
> +    }
> +
> +    if ( align == 0 )
> +        align = MB(2);
> +
> +    offset = find_kernel_memory(d, elf, align);
> +    if ( offset == 0 )
> +    {
> +        printk("Failed to find a load offset for the kernel\n");
> +        return false;
> +    }
> +
> +    printk("Adjusting load address by %#lx\n", offset);

I think this would be more helpful if the previous and the new ranges
are printed, as I'm not sure the previous dest_base is printed, in
which case the offset doesn't help much.  I would do:

if ( opt_dom0_verbose )
    printk("relocating kernel from [%lx, %lx] -> [%lx, %lx]\n", ...);

> +    elf->dest_base += offset;
> +    parms->phys_entry += offset;

As noted above, I think it would be better if find_kernel_memory()
find an absolute address which is then adjusted here.

> +
> +    return true;
> +}
> +
>  static int __init pvh_load_kernel(struct domain *d, const module_t *image,
>                                    unsigned long image_headroom,
>                                    module_t *initrd, void *image_base,
> @@ -587,6 +690,12 @@ static int __init pvh_load_kernel(struct domain *d, const module_t *image,
>      elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base);
>      elf.dest_size = parms.virt_kend - parms.virt_kstart;
>  
> +    if ( !adjust_load_address(d, &elf, &parms, align) )

check_and_adjust_?  As the address is not unconditionally adjusted.

Thanks, Roger.
Jason Andryuk March 7, 2024, 4:07 p.m. UTC | #3
On 2024-03-06 21:09, Stefano Stabellini wrote:
> On Wed, 6 Mar 2024, Jason Andryuk wrote:
>> Xen tries to load a PVH dom0 kernel at the fixed guest physical address
>> from the elf headers.  For Linux, this defaults to 0x1000000 (16MB), but
>> it can be configured.
>>
>> Unfortunately there exist firmwares that have reserved regions at this
>> address, so Xen fails to load the dom0 kernel since it's not RAM.
>>
>> The PVH entry code is not relocatable - it loads from absolute
>> addresses, which fail when the kernel is loaded at a different address.
>> With a suitably modified kernel, a relocatable entry point is possible.
>>
>> Add the XENFEAT_pvh_relocatable flag to let a kernel indicate that it
>> supports a relocatable entry path.
>>
>> Change the loading to check for an acceptable load address.  If the
>> kernel is relocatable, support finding an alternate load address.
>>
>> Linux cares about its physical alignment.  This can be pulled out of the
>> bzImage header, but not from the vmlinux ELF file.  If an alignment
>> can't be found, use 2MB.
>>
>> Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
>> ---
>> Put alignment as a new ELF note?  Presence of that note would indicate
>> relocation support without a new XENFEAT flag.
>>
>> Default alignment to a multiple of 2MB to make more cases work?  It has
>> to be a power of two, and a multiple might allow loading a customized
>> kernel.  A larger alignment would limit the number of possible load
>> locations.
>> ---
>>   xen/arch/x86/hvm/dom0_build.c | 109 ++++++++++++++++++++++++++++++++++
>>   xen/include/public/features.h |   5 ++
>>   2 files changed, 114 insertions(+)
>>
>> diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
>> index bbae8a5645..34d68ee8fb 100644
>> --- a/xen/arch/x86/hvm/dom0_build.c
>> +++ b/xen/arch/x86/hvm/dom0_build.c
>> @@ -537,6 +537,109 @@ static paddr_t __init find_memory(
>>       return INVALID_PADDR;
>>   }
>>   
>> +static bool __init check_load_address(
>> +    const struct domain *d, const struct elf_binary *elf)
>> +{
>> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
>> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
>> +                                 PAGE_SIZE);
>> +    unsigned int i;
>> +
>> +    /*
>> +     * The memory map is sorted and all RAM regions starts and sizes are
>> +     * aligned to page boundaries.
>> +     */
>> +    for ( i = 0; i < d->arch.nr_e820; i++ )
>> +    {
>> +        paddr_t start = d->arch.e820[i].addr;
>> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
>> +
>> +        if ( start <= kernel_start &&
>> +             end >= kernel_end &&
>> +             d->arch.e820[i].type == E820_RAM )
>> +            return true;
>> +    }
>> +
>> +    return false;
>> +}
>> +
>> +/*
>> + * Find an e820 RAM region that fits the kernel at a suitable alignment.
>> + */
>> +static paddr_t find_kernel_memory(
>> +    const struct domain *d, struct elf_binary *elf, paddr_t align)
>> +{
>> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
>> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
>> +                                 PAGE_SIZE);
>> +    unsigned int i;
>> +
>> +    /*
>> +     * The memory map is sorted and all RAM regions starts and sizes are
>> +     * aligned to page boundaries.
>> +     */
>> +    for ( i = 0; i < d->arch.nr_e820; i++ )
>> +    {
>> +        paddr_t start = d->arch.e820[i].addr;
>> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
>> +        paddr_t kstart, kend, offset;
>> +
>> +        if ( d->arch.e820[i].type != E820_RAM )
>> +            continue;
>> +
>> +        if ( d->arch.e820[i].size < elf->dest_size )
>> +            continue;
>> +
>> +        if ( end < kernel_end )
>> +            continue;
> 
> Why this check? Is it to make sure we look for e820 regions that are
> higher in terms of addresses? If so, couldn't we start from
> d->arch.nr_e820 and go down instead of starting from 0 and going up?

Yes, I thought we only wanted a higher address.  The Linux bzImage entry 
code uses the LOAD_PHYSICAL_ADDR (CONFIG_PHYSICAL_START/elf->dest_base) 
as a minimum when extracting vmlinux for a relocatable kernel.  I'm not 
sure if that is strictly required though.

I also thought a smaller adjustment would be better, so starting from 
the lower e820 entries would find the first acceptable one.  But that 
may not matter.

> The PVH entry point is a 32-bit entry point if I remember right? Do we
> need a 32-bit check? If so then it might not be a good idea to start
> from arch.nr_e820 and go down.

Yes, the entry point is 32-bit, so you are correct that the range should be
limited to 4GB.
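
For reference, keeping the upward scan while honouring that limit could be
as simple as clamping each region (a sketch on top of the posted loop; the
clamp itself is not in the patch, and it assumes the GB() helper alongside
the MB() one already used):

        paddr_t start = d->arch.e820[i].addr;
        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;

        /* 32-bit entry point: nothing at or above 4GB is usable. */
        if ( start >= GB(4) )
            break;              /* Map is sorted, so we are done. */
        if ( end > GB(4) )
            end = GB(4);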

Regards,
Jason
Jason Andryuk March 7, 2024, 5:01 p.m. UTC | #4
On 2024-03-07 04:30, Roger Pau Monné wrote:
> On Wed, Mar 06, 2024 at 01:50:32PM -0500, Jason Andryuk wrote:
>> Xen tries to load a PVH dom0 kernel at the fixed guest physical address
>> from the elf headers.  For Linux, this defaults to 0x1000000 (16MB), but
>> it can be configured.
>>
>> Unfortunately there exist firmwares that have reserved regions at this
>> address, so Xen fails to load the dom0 kernel since it's not RAM.
>>
>> The PVH entry code is not relocatable - it loads from absolute
>> addresses, which fail when the kernel is loaded at a different address.
>> With a suitably modified kernel, a relocatable entry point is possible.
>>
>> Add the XENFEAT_pvh_relocatable flag to let a kernel indicate that it
>> supports a relocatable entry path.
>>
>> Change the loading to check for an acceptable load address.  If the
>> kernel is relocatable, support finding an alternate load address.
>>
>> Linux cares about its physical alignment.  This can be pulled out of the
>> bzImage header, but not from the vmlinux ELF file.  If an alignment
>> can't be found, use 2MB.
> 
> While I'm fine with having a Linux specific way, there needs to be a
> generic way of passing the alignment for non-bzImage kernels.
> 
> ELF program headers have an align field, would that be suitable to
> use?

Unfortunately, it doesn't seem correct.  Linux has 
CONFIG_PHYSICAL_ALIGN, and it doesn't seem to be used in the elf 
headers.  As a quick test, I set CONFIG_PHYSICAL_ALIGN=0x800000, but the 
elf align values are still 0x200000.

> In elf_parse_binary() where the p{start,end} is calculated, you could
> also fetch the p_offset from the lower found program header and use it
> as the required alignment.  Would that be OK for Linux and maybe avoid
> having to fiddle with the bzImage header?  FWIW it is likely fine for
> FreeBSD.

Adding an explicit value in an elf note would avoid any ambiguity.
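
To sketch what that could look like -- the note name, its number and the
parms field are all invented here for illustration, nothing below exists in
this series -- assuming libelf's existing elf_note_numeric() helper:

/* xen/include/public/elfnote.h -- hypothetical new note. */
#define XEN_ELFNOTE_PHYS_ALIGN 20

/* In libelf's elf_xen_parse_note() switch -- record it for dom0_build. */
case XEN_ELFNOTE_PHYS_ALIGN:
    parms->phys_align = elf_note_numeric(elf, note);
    break;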

>>
>> Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
> 
> I created a gitlab ticket for this:
> 
> https://gitlab.com/xen-project/xen/-/issues/180
> 
> If you want to reference it.

Will do

>> ---
>> Put alignment as a new ELF note?  Presence of that note would indicate
>> relocation support without a new XENFEAT flag.
>>
>> Default alignment to a multiple of 2MB to make more cases work?  It has
>> to be a power of two, and a multiple might allow loading a customized
>> kernel.  A larger alignment would limit the number of possible load
>> locations.
>> ---
>>   xen/arch/x86/hvm/dom0_build.c | 109 ++++++++++++++++++++++++++++++++++
>>   xen/include/public/features.h |   5 ++
>>   2 files changed, 114 insertions(+)
>>
>> diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
>> index bbae8a5645..34d68ee8fb 100644
>> --- a/xen/arch/x86/hvm/dom0_build.c
>> +++ b/xen/arch/x86/hvm/dom0_build.c
>> @@ -537,6 +537,109 @@ static paddr_t __init find_memory(
>>       return INVALID_PADDR;
>>   }
>>   
>> +static bool __init check_load_address(
>> +    const struct domain *d, const struct elf_binary *elf)
>> +{
>> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
>> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
>> +                                 PAGE_SIZE);
> 
> You can use PAGE_ALIGN() here (and below) for simplicity.

Ok

>> +    unsigned int i;
>> +
>> +    /*
>> +     * The memory map is sorted and all RAM regions starts and sizes are
>> +     * aligned to page boundaries.
>> +     */
>> +    for ( i = 0; i < d->arch.nr_e820; i++ )
>> +    {
>> +        paddr_t start = d->arch.e820[i].addr;
>> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
> 
> Since the memory map is sorted you can end the loop once a region's start
> is >= kernel_end?  Any further regions are past the kernel destination.

Yes, thanks.

>> +
>> +        if ( start <= kernel_start &&
>> +             end >= kernel_end &&
>> +             d->arch.e820[i].type == E820_RAM )
>> +            return true;
>> +    }
>> +
>> +    return false;
>> +}
>> +
>> +/*
>> + * Find an e820 RAM region that fits the kernel at a suitable alignment.
>> + */
> 
> This (and other) comment seems to fit in a single line: /* ... */.

Ok

>> +static paddr_t find_kernel_memory(
>> +    const struct domain *d, struct elf_binary *elf, paddr_t align)
> 
> elf can be const AFAICT.

Ok

>> +{
>> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
>> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
>> +                                 PAGE_SIZE);
>> +    unsigned int i;
>> +
>> +    /*
>> +     * The memory map is sorted and all RAM regions starts and sizes are
>> +     * aligned to page boundaries.
>> +     */
>> +    for ( i = 0; i < d->arch.nr_e820; i++ )
>> +    {
>> +        paddr_t start = d->arch.e820[i].addr;
>> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
>> +        paddr_t kstart, kend, offset;
>> +
>> +        if ( d->arch.e820[i].type != E820_RAM )
>> +            continue;
>> +
>> +        if ( d->arch.e820[i].size < elf->dest_size )
>> +            continue;
>> +
>> +        if ( end < kernel_end )
>> +            continue;
> 
> I'm not sure I understand this check, why would we refuse regions
> below the fixed kernel end?  Those should be equally suitable if big
> enough and meeting the alignment requirements.

I thought (Linux at least) wouldn't want to be moved lower - only higher 
in RAM.  There was some Linux boot code that gave me that impression,
but it may not be strictly true.

>> +
>> +        kstart = ROUNDUP(start, align);
>> +        offset = kstart - kernel_start;
>> +        kend = kernel_end + offset;
>> +
>> +        if ( kend <= end )
>> +            return offset;
> 
> Why not return this as an address to use to load the kernel instead of
> an offset from dest_base?  That would also make the calculations
> easier IMO.

phys_entry needs to be updated as well as dest_base, so returning the 
offset seemed more useful.

>> +    }
> 
> This should be limited to a range below 4GB.

Yes.

>> +    return 0;
>> +}
>> +
>> +/*
>> + * Check the kernel load address, and adjust if necessary and possible.
>> + */
>> +static bool __init adjust_load_address(
>> +    const struct domain *d, struct elf_binary *elf, struct elf_dom_parms *parms,
>> +    paddr_t align)
>> +{
>> +    paddr_t offset;
>> +
>> +    /* Check load address */
>> +    if ( check_load_address(d, elf) )
>> +        return true;
>> +
>> +    if ( !test_bit(XENFEAT_pvh_relocatable, parms->f_supported) )
>> +    {
>> +        printk("Address conflict and %pd kernel is not relocatable\n", d);
>> +        return false;
>> +    }
>> +
>> +    if ( align == 0 )
>> +        align = MB(2);
>> +
>> +    offset = find_kernel_memory(d, elf, align);
>> +    if ( offset == 0 )
>> +    {
>> +        printk("Failed to find a load offset for the kernel\n");
>> +        return false;
>> +    }
>> +
>> +    printk("Adjusting load address by %#lx\n", offset);
> 
> I think this would be more helpful if the previous and the new ranges
> are printed, as I'm not sure the previous dest_base is printed, in
> which case the offset doesn't help much.  I would do:
> 
> if ( opt_dom0_verbose )
>      printk("relocating kernel from [%lx, %lx] -> [%lx, %lx]\n", ...);

dest_base was printed (maybe with extra verbosity), but this message is 
clearer since it shows the end result.

>> +    elf->dest_base += offset;
>> +    parms->phys_entry += offset;
> 
> As noted above, I think it would be better if find_kernel_memory()
> find an absolute address which is then adjusted here.

I thought the modification with just offset was clearer compared to:

parms->phys_entry += (reloc_base - elf->dest_base);
elf->dest_base = reloc_base;

But I'm fine making the change.
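
For comparison, the absolute-address version of the caller would be roughly
(a sketch; INVALID_PADDR as the not-found value mirrors find_memory() earlier
in the file):

    paddr_t reloc_base = find_kernel_memory(d, elf, align);

    if ( reloc_base == INVALID_PADDR )
    {
        printk("Failed to find a load address for the kernel\n");
        return false;
    }

    parms->phys_entry += reloc_base - (paddr_t)elf->dest_base;
    elf->dest_base = (void *)reloc_base;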

>> +
>> +    return true;
>> +}
>> +
>>   static int __init pvh_load_kernel(struct domain *d, const module_t *image,
>>                                     unsigned long image_headroom,
>>                                     module_t *initrd, void *image_base,
>> @@ -587,6 +690,12 @@ static int __init pvh_load_kernel(struct domain *d, const module_t *image,
>>       elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base);
>>       elf.dest_size = parms.virt_kend - parms.virt_kstart;
>>   
>> +    if ( !adjust_load_address(d, &elf, &parms, align) )
> 
> check_and_adjust_?  As the address is not unconditionally adjusted.

I thought it was a little too wordy, but it is more accurate.  I'll 
rename it.

Thanks for taking a look.

-Jason
Jürgen Groß March 8, 2024, 6:34 a.m. UTC | #5
On 07.03.24 18:01, Jason Andryuk wrote:
> On 2024-03-07 04:30, Roger Pau Monné wrote:
>> On Wed, Mar 06, 2024 at 01:50:32PM -0500, Jason Andryuk wrote:
>>> Xen tries to load a PVH dom0 kernel at the fixed guest physical address
>>> from the elf headers.  For Linux, this defaults to 0x1000000 (16MB), but
>>> it can be configured.
>>>
>>> Unfortunately there exist firmwares that have reserved regions at this
>>> address, so Xen fails to load the dom0 kernel since it's not RAM.
>>>
>>> The PVH entry code is not relocatable - it loads from absolute
>>> addresses, which fail when the kernel is loaded at a different address.
>>> With a suitably modified kernel, a relocatable entry point is possible.
>>>
>>> Add the XENFEAT_pvh_relocatable flag to let a kernel indicate that it
>>> supports a relocatable entry path.
>>>
>>> Change the loading to check for an acceptable load address.  If the
>>> kernel is relocatable, support finding an alternate load address.
>>>
>>> Linux cares about its physical alignment.  This can be pulled out of the
>>> bzImage header, but not from the vmlinux ELF file.  If an alignment
>>> can't be found, use 2MB.
>>
>> While I'm fine with having a Linux specific way, there needs to be a
>> generic way of passing the alignment for non-bzImage kernels.
>>
>> ELF program headers have an align field, would that be suitable to
>> use?
> 
> Unfortunately, it doesn't seem correct.  Linux has CONFIG_PHYSICAL_ALIGN, and it 
> doesn't seem to be used in the elf headers.  As a quick test, I set 
> CONFIG_PHYSICAL_ALIGN=0x800000, but the elf align values are still 0x200000.

An excerpt from the kernel's arch/x86/Makefile:

#
# The 64-bit kernel must be aligned to 2MB.  Pass -z max-page-size=0x200000 to
# the linker to force 2MB page size regardless of the default page size used
# by the linker.
#
ifdef CONFIG_X86_64
LDFLAGS_vmlinux += -z max-page-size=0x200000
endif


Juergen
Jan Beulich March 11, 2024, 4:53 p.m. UTC | #6
On 06.03.2024 19:50, Jason Andryuk wrote:
> --- a/xen/arch/x86/hvm/dom0_build.c
> +++ b/xen/arch/x86/hvm/dom0_build.c
> @@ -537,6 +537,109 @@ static paddr_t __init find_memory(
>      return INVALID_PADDR;
>  }
>  
> +static bool __init check_load_address(
> +    const struct domain *d, const struct elf_binary *elf)
> +{
> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
> +                                 PAGE_SIZE);
> +    unsigned int i;
> +
> +    /*
> +     * The memory map is sorted and all RAM regions starts and sizes are
> +     * aligned to page boundaries.
> +     */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        paddr_t start = d->arch.e820[i].addr;
> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
> +
> +        if ( start <= kernel_start &&
> +             end >= kernel_end &&
> +             d->arch.e820[i].type == E820_RAM )
> +            return true;
> +    }
> +
> +    return false;
> +}
> +
> +/*
> + * Find an e820 RAM region that fits the kernel at a suitable alignment.
> + */
> +static paddr_t find_kernel_memory(

__init ?

Jan
Jason Andryuk March 11, 2024, 7:53 p.m. UTC | #7
On 2024-03-11 12:53, Jan Beulich wrote:
> On 06.03.2024 19:50, Jason Andryuk wrote:
>> --- a/xen/arch/x86/hvm/dom0_build.c
>> +++ b/xen/arch/x86/hvm/dom0_build.c
>> @@ -537,6 +537,109 @@ static paddr_t __init find_memory(
>>       return INVALID_PADDR;
>>   }
>>   
>> +static bool __init check_load_address(
>> +    const struct domain *d, const struct elf_binary *elf)
>> +{
>> +    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
>> +    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
>> +                                 PAGE_SIZE);
>> +    unsigned int i;
>> +
>> +    /*
>> +     * The memory map is sorted and all RAM regions starts and sizes are
>> +     * aligned to page boundaries.
>> +     */
>> +    for ( i = 0; i < d->arch.nr_e820; i++ )
>> +    {
>> +        paddr_t start = d->arch.e820[i].addr;
>> +        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
>> +
>> +        if ( start <= kernel_start &&
>> +             end >= kernel_end &&
>> +             d->arch.e820[i].type == E820_RAM )
>> +            return true;
>> +    }
>> +
>> +    return false;
>> +}
>> +
>> +/*
>> + * Find an e820 RAM region that fits the kernel at a suitable alignment.
>> + */
>> +static paddr_t find_kernel_memory(
> 
> __init ?

Yes, thanks for catching that.

Regards,
Jason

Patch

diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index bbae8a5645..34d68ee8fb 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -537,6 +537,109 @@  static paddr_t __init find_memory(
     return INVALID_PADDR;
 }
 
+static bool __init check_load_address(
+    const struct domain *d, const struct elf_binary *elf)
+{
+    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
+    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
+                                 PAGE_SIZE);
+    unsigned int i;
+
+    /*
+     * The memory map is sorted and all RAM regions starts and sizes are
+     * aligned to page boundaries.
+     */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        paddr_t start = d->arch.e820[i].addr;
+        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
+
+        if ( start <= kernel_start &&
+             end >= kernel_end &&
+             d->arch.e820[i].type == E820_RAM )
+            return true;
+    }
+
+    return false;
+}
+
+/*
+ * Find an e820 RAM region that fits the kernel at a suitable alignment.
+ */
+static paddr_t find_kernel_memory(
+    const struct domain *d, struct elf_binary *elf, paddr_t align)
+{
+    paddr_t kernel_start = (paddr_t)elf->dest_base & PAGE_MASK;
+    paddr_t kernel_end = ROUNDUP((paddr_t)elf->dest_base + elf->dest_size,
+                                 PAGE_SIZE);
+    unsigned int i;
+
+    /*
+     * The memory map is sorted and all RAM regions starts and sizes are
+     * aligned to page boundaries.
+     */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        paddr_t start = d->arch.e820[i].addr;
+        paddr_t end = d->arch.e820[i].addr + d->arch.e820[i].size;
+        paddr_t kstart, kend, offset;
+
+        if ( d->arch.e820[i].type != E820_RAM )
+            continue;
+
+        if ( d->arch.e820[i].size < elf->dest_size )
+            continue;
+
+        if ( end < kernel_end )
+            continue;
+
+        kstart = ROUNDUP(start, align);
+        offset = kstart - kernel_start;
+        kend = kernel_end + offset;
+
+        if ( kend <= end )
+            return offset;
+    }
+
+    return 0;
+}
+
+/*
+ * Check the kernel load address, and adjust if necessary and possible.
+ */
+static bool __init adjust_load_address(
+    const struct domain *d, struct elf_binary *elf, struct elf_dom_parms *parms,
+    paddr_t align)
+{
+    paddr_t offset;
+
+    /* Check load address */
+    if ( check_load_address(d, elf) )
+        return true;
+
+    if ( !test_bit(XENFEAT_pvh_relocatable, parms->f_supported) )
+    {
+        printk("Address conflict and %pd kernel is not relocatable\n", d);
+        return false;
+    }
+
+    if ( align == 0 )
+        align = MB(2);
+
+    offset = find_kernel_memory(d, elf, align);
+    if ( offset == 0 )
+    {
+        printk("Failed to find a load offset for the kernel\n");
+        return false;
+    }
+
+    printk("Adjusting load address by %#lx\n", offset);
+    elf->dest_base += offset;
+    parms->phys_entry += offset;
+
+    return true;
+}
+
 static int __init pvh_load_kernel(struct domain *d, const module_t *image,
                                   unsigned long image_headroom,
                                   module_t *initrd, void *image_base,
@@ -587,6 +690,12 @@  static int __init pvh_load_kernel(struct domain *d, const module_t *image,
     elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base);
     elf.dest_size = parms.virt_kend - parms.virt_kstart;
 
+    if ( !adjust_load_address(d, &elf, &parms, align) )
+    {
+        printk("Unable to load kernel\n");
+        return -ENOMEM;
+    }
+
     elf_set_vcpu(&elf, v);
     rc = elf_load_binary(&elf);
     if ( rc < 0 )
diff --git a/xen/include/public/features.h b/xen/include/public/features.h
index 4437f25d25..300480cb22 100644
--- a/xen/include/public/features.h
+++ b/xen/include/public/features.h
@@ -120,6 +120,11 @@ 
 #define XENFEAT_runstate_phys_area        18
 #define XENFEAT_vcpu_time_phys_area       19
 
+/*
+ * PVH: If set, the guest supports relocation in load address.
+ */
+#define XENFEAT_pvh_relocatable           20
+
 #define XENFEAT_NR_SUBMAPS 1
 
 #endif /* __XEN_PUBLIC_FEATURES_H__ */