diff mbox series

xen: consider alloc-only segments when loading PV dom0 kernel

Message ID 20220623080208.2214-1-jgross@suse.com (mailing list archive)
State New, archived
Headers show
Series xen: consider alloc-only segments when loading PV dom0 kernel | expand

Commit Message

Jürgen Groß June 23, 2022, 8:02 a.m. UTC
When loading the dom0 kernel for PV mode, the calculation of the first
free usable memory location after the kernel needs to take into account
segments which have only the ALLOC flag set, but which are not specified
to be loaded in the program headers of the ELF file.

This is e.g. a problem for Linux kernels from 5.19 onwards, as those
can have a final NOLOAD section at the end, which must not be used by
e.g. the start_info structure or the initial page tables allocated by
the hypervisor.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/common/libelf/libelf-loader.c | 33 +++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

Comments

Jan Beulich June 23, 2022, 9:04 a.m. UTC | #1
On 23.06.2022 10:02, Juergen Gross wrote:
> When loading the dom0 kernel for PV mode, the first free usable memory
> location after the kernel needs to take segments into account, which
> have only the ALLOC flag set, but are not specified to be loaded in
> the program headers of the ELF file.
> 
> This is e.g. a problem for Linux kernels from 5.19 onwards, as those
> can have a final NOLOAD section at the end, which must not be used by
> e.g. the start_info structure or the initial page tables allocated by
> the hypervisor.
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>
> ---
>  xen/common/libelf/libelf-loader.c | 33 +++++++++++++++++++++++++++++++
>  1 file changed, 33 insertions(+)
> 
> diff --git a/xen/common/libelf/libelf-loader.c b/xen/common/libelf/libelf-loader.c
> index 629cc0d3e6..4b0e3ced55 100644
> --- a/xen/common/libelf/libelf-loader.c
> +++ b/xen/common/libelf/libelf-loader.c
> @@ -467,7 +467,9 @@ do {                                                                \
>  void elf_parse_binary(struct elf_binary *elf)
>  {
>      ELF_HANDLE_DECL(elf_phdr) phdr;
> +    ELF_HANDLE_DECL(elf_shdr) shdr;
>      uint64_t low = -1, high = 0, paddr, memsz;
> +    uint64_t vlow = -1, vhigh = 0, vaddr, voff;
>      unsigned i, count;
>  
>      count = elf_phdr_count(elf);
> @@ -480,6 +482,7 @@ void elf_parse_binary(struct elf_binary *elf)
>          if ( !elf_phdr_is_loadable(elf, phdr) )
>              continue;
>          paddr = elf_uval(elf, phdr, p_paddr);
> +        vaddr = elf_uval(elf, phdr, p_vaddr);
>          memsz = elf_uval(elf, phdr, p_memsz);
>          elf_msg(elf, "ELF: phdr: paddr=%#" PRIx64 " memsz=%#" PRIx64 "\n",
>                  paddr, memsz);
> @@ -487,7 +490,37 @@ void elf_parse_binary(struct elf_binary *elf)
>              low = paddr;
>          if ( high < paddr + memsz )
>              high = paddr + memsz;
> +        if ( vlow > vaddr )
> +            vlow = vaddr;
> +        if ( vhigh < vaddr + memsz )
> +            vhigh = vaddr + memsz;
>      }
> +
> +    voff = vhigh - high;
> +
> +    count = elf_shdr_count(elf);
> +    for ( i = 0; i < count; i++ )
> +    {
> +        shdr = elf_shdr_by_index(elf, i);
> +        if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) )
> +            /* input has an insane section header count field */
> +            break;
> +        if ( !(elf_uval(elf, shdr, sh_flags) & SHF_ALLOC) )
> +            continue;
> +        vaddr = elf_uval(elf, shdr, sh_addr);
> +        memsz = elf_uval(elf, shdr, sh_size);
> +        if ( vlow > vaddr )
> +        {
> +            vlow = vaddr;
> +            low = vaddr - voff;
> +        }
> +        if ( vhigh < vaddr + memsz )
> +        {
> +            vhigh = vaddr + memsz;
> +            high = vaddr + memsz - voff;
> +        }
> +    }

As said in the reply to your problem report: The set of PHDRs doesn't
cover all sections. For loading one should never need to resort to
parsing section headers - in a loadable binary it is no error if
there's no section table in the first place. (The title is also
misleading, as you really mean sections there, not segments. Afaik
there's no concept of "alloc" for segments, which are what program
headers describe.)

Also: Needing to fix this in the hypervisor would mean that Linux
5.19 and onwards cannot be booted on Xen unless the respective fix
has been backported.

Finally, you changing libelf but referring to only Dom0 in the title
looks inconsistent to me.

Jan
Jürgen Groß June 23, 2022, 9:08 a.m. UTC | #2
On 23.06.22 11:04, Jan Beulich wrote:
> On 23.06.2022 10:02, Juergen Gross wrote:
>> When loading the dom0 kernel for PV mode, the first free usable memory
>> location after the kernel needs to take segments into account, which
>> have only the ALLOC flag set, but are not specified to be loaded in
>> the program headers of the ELF file.
>>
>> This is e.g. a problem for Linux kernels from 5.19 onwards, as those
>> can have a final NOLOAD section at the end, which must not be used by
>> e.g. the start_info structure or the initial page tables allocated by
>> the hypervisor.
>>
>> Signed-off-by: Juergen Gross <jgross@suse.com>
>> ---
>>   xen/common/libelf/libelf-loader.c | 33 +++++++++++++++++++++++++++++++
>>   1 file changed, 33 insertions(+)
>>
>> diff --git a/xen/common/libelf/libelf-loader.c b/xen/common/libelf/libelf-loader.c
>> index 629cc0d3e6..4b0e3ced55 100644
>> --- a/xen/common/libelf/libelf-loader.c
>> +++ b/xen/common/libelf/libelf-loader.c
>> @@ -467,7 +467,9 @@ do {                                                                \
>>   void elf_parse_binary(struct elf_binary *elf)
>>   {
>>       ELF_HANDLE_DECL(elf_phdr) phdr;
>> +    ELF_HANDLE_DECL(elf_shdr) shdr;
>>       uint64_t low = -1, high = 0, paddr, memsz;
>> +    uint64_t vlow = -1, vhigh = 0, vaddr, voff;
>>       unsigned i, count;
>>   
>>       count = elf_phdr_count(elf);
>> @@ -480,6 +482,7 @@ void elf_parse_binary(struct elf_binary *elf)
>>           if ( !elf_phdr_is_loadable(elf, phdr) )
>>               continue;
>>           paddr = elf_uval(elf, phdr, p_paddr);
>> +        vaddr = elf_uval(elf, phdr, p_vaddr);
>>           memsz = elf_uval(elf, phdr, p_memsz);
>>           elf_msg(elf, "ELF: phdr: paddr=%#" PRIx64 " memsz=%#" PRIx64 "\n",
>>                   paddr, memsz);
>> @@ -487,7 +490,37 @@ void elf_parse_binary(struct elf_binary *elf)
>>               low = paddr;
>>           if ( high < paddr + memsz )
>>               high = paddr + memsz;
>> +        if ( vlow > vaddr )
>> +            vlow = vaddr;
>> +        if ( vhigh < vaddr + memsz )
>> +            vhigh = vaddr + memsz;
>>       }
>> +
>> +    voff = vhigh - high;
>> +
>> +    count = elf_shdr_count(elf);
>> +    for ( i = 0; i < count; i++ )
>> +    {
>> +        shdr = elf_shdr_by_index(elf, i);
>> +        if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) )
>> +            /* input has an insane section header count field */
>> +            break;
>> +        if ( !(elf_uval(elf, shdr, sh_flags) & SHF_ALLOC) )
>> +            continue;
>> +        vaddr = elf_uval(elf, shdr, sh_addr);
>> +        memsz = elf_uval(elf, shdr, sh_size);
>> +        if ( vlow > vaddr )
>> +        {
>> +            vlow = vaddr;
>> +            low = vaddr - voff;
>> +        }
>> +        if ( vhigh < vaddr + memsz )
>> +        {
>> +            vhigh = vaddr + memsz;
>> +            high = vaddr + memsz - voff;
>> +        }
>> +    }
> 
> As said in the reply to your problem report: The set of PHDRs doesn't
> cover all sections. For loading one should never need to resort to
> parsing section headers - in a loadable binary it is no error if
> there's no section table in the first place. (The title is also

The problem isn't the loading, but the memory usage after doing the
loading. The hypervisor is placing page tables in a memory region
for which the kernel has other plans.

> misleading, as you really mean sections there, not segments. Afaik
> there's no concept of "alloc" for segments, which are what program
> headers describe.)

Sorry, will reword.

> Also: Needing to fix this in the hypervisor would mean that Linux
> 5.19 and onwards cannot be booted on Xen without whichever fix
> backported.

Correct. See my reply to the reply you mentioned above.

> Finally, you changing libelf but referring to only Dom0 in the title
> looks inconsistent to me.

Hmm, yes. Will drop the dom0 aspect.


Juergen
Jan Beulich June 23, 2022, 9:14 a.m. UTC | #3
On 23.06.2022 11:08, Juergen Gross wrote:
> On 23.06.22 11:04, Jan Beulich wrote:
>> On 23.06.2022 10:02, Juergen Gross wrote:
>>> When loading the dom0 kernel for PV mode, the first free usable memory
>>> location after the kernel needs to take segments into account, which
>>> have only the ALLOC flag set, but are not specified to be loaded in
>>> the program headers of the ELF file.
>>>
>>> This is e.g. a problem for Linux kernels from 5.19 onwards, as those
>>> can have a final NOLOAD section at the end, which must not be used by
>>> e.g. the start_info structure or the initial page tables allocated by
>>> the hypervisor.
>>>
>>> Signed-off-by: Juergen Gross <jgross@suse.com>
>>> ---
>>>   xen/common/libelf/libelf-loader.c | 33 +++++++++++++++++++++++++++++++
>>>   1 file changed, 33 insertions(+)
>>>
>>> diff --git a/xen/common/libelf/libelf-loader.c b/xen/common/libelf/libelf-loader.c
>>> index 629cc0d3e6..4b0e3ced55 100644
>>> --- a/xen/common/libelf/libelf-loader.c
>>> +++ b/xen/common/libelf/libelf-loader.c
>>> @@ -467,7 +467,9 @@ do {                                                                \
>>>   void elf_parse_binary(struct elf_binary *elf)
>>>   {
>>>       ELF_HANDLE_DECL(elf_phdr) phdr;
>>> +    ELF_HANDLE_DECL(elf_shdr) shdr;
>>>       uint64_t low = -1, high = 0, paddr, memsz;
>>> +    uint64_t vlow = -1, vhigh = 0, vaddr, voff;
>>>       unsigned i, count;
>>>   
>>>       count = elf_phdr_count(elf);
>>> @@ -480,6 +482,7 @@ void elf_parse_binary(struct elf_binary *elf)
>>>           if ( !elf_phdr_is_loadable(elf, phdr) )
>>>               continue;
>>>           paddr = elf_uval(elf, phdr, p_paddr);
>>> +        vaddr = elf_uval(elf, phdr, p_vaddr);
>>>           memsz = elf_uval(elf, phdr, p_memsz);
>>>           elf_msg(elf, "ELF: phdr: paddr=%#" PRIx64 " memsz=%#" PRIx64 "\n",
>>>                   paddr, memsz);
>>> @@ -487,7 +490,37 @@ void elf_parse_binary(struct elf_binary *elf)
>>>               low = paddr;
>>>           if ( high < paddr + memsz )
>>>               high = paddr + memsz;
>>> +        if ( vlow > vaddr )
>>> +            vlow = vaddr;
>>> +        if ( vhigh < vaddr + memsz )
>>> +            vhigh = vaddr + memsz;
>>>       }
>>> +
>>> +    voff = vhigh - high;
>>> +
>>> +    count = elf_shdr_count(elf);
>>> +    for ( i = 0; i < count; i++ )
>>> +    {
>>> +        shdr = elf_shdr_by_index(elf, i);
>>> +        if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) )
>>> +            /* input has an insane section header count field */
>>> +            break;
>>> +        if ( !(elf_uval(elf, shdr, sh_flags) & SHF_ALLOC) )
>>> +            continue;
>>> +        vaddr = elf_uval(elf, shdr, sh_addr);
>>> +        memsz = elf_uval(elf, shdr, sh_size);
>>> +        if ( vlow > vaddr )
>>> +        {
>>> +            vlow = vaddr;
>>> +            low = vaddr - voff;
>>> +        }
>>> +        if ( vhigh < vaddr + memsz )
>>> +        {
>>> +            vhigh = vaddr + memsz;
>>> +            high = vaddr + memsz - voff;
>>> +        }
>>> +    }
>>
>> As said in the reply to your problem report: The set of PHDRs doesn't
>> cover all sections. For loading one should never need to resort to
>> parsing section headers - in a loadable binary it is no error if
>> there's no section table in the first place. (The title is also
> 
> The problem isn't the loading, but the memory usage after doing the
> loading. The hypervisor is placing page tables in a memory region
> the kernel has other plans with.

But part of "loading" is to determine the extent of the binary, which
is what the program headers (and only them) ought to describe. Note
also that our "loading" includes correct handling of .bss-style parts
of segments (i.e. their clearing):

/*
 * Place one piece of the image at dst.
 *
 * The first filesz bytes are copied from src; the remaining
 * (memsz - filesz) bytes following them are handed to elf_memcpy() with a
 * NULL source (presumably a request to clear that range -- confirm against
 * elf_memcpy()).
 *
 * Returns 0 on success, -1 if either size exceeds ULONG_MAX or if one of
 * the two elf_memcpy() calls reports failure.
 */
static elf_errorstatus elf_load_image(struct elf_binary *elf, elf_ptrval dst, elf_ptrval src, uint64_t filesz, uint64_t memsz)
{
    elf_errorstatus rc;

    /* Refuse sizes which do not fit the ULONG_MAX limit. */
    if ( filesz > ULONG_MAX || memsz > ULONG_MAX )
        return -1;

    /*
     * The dom0 kernel image is fully trusted, hence no checking for
     * overruns and the like is done here.
     */
    rc = elf_memcpy(elf->vcpu, ELF_UNSAFE_PTR(dst), ELF_UNSAFE_PTR(src),
                    filesz);

    /* Only deal with the tail beyond filesz once the copy has succeeded. */
    if ( rc == 0 )
        rc = elf_memcpy(elf->vcpu, ELF_UNSAFE_PTR(dst + filesz), NULL,
                        memsz - filesz);

    return rc != 0 ? -1 : 0;
}

IOW in principle there's no need for the kernel to clear its .bss
(a 2nd time). Provided, of course, the phdrs properly describe the
entire image.

Jan
diff mbox series

Patch

diff --git a/xen/common/libelf/libelf-loader.c b/xen/common/libelf/libelf-loader.c
index 629cc0d3e6..4b0e3ced55 100644
--- a/xen/common/libelf/libelf-loader.c
+++ b/xen/common/libelf/libelf-loader.c
@@ -467,7 +467,9 @@  do {                                                                \
 void elf_parse_binary(struct elf_binary *elf)
 {
     ELF_HANDLE_DECL(elf_phdr) phdr;
+    ELF_HANDLE_DECL(elf_shdr) shdr;
     uint64_t low = -1, high = 0, paddr, memsz;
+    uint64_t vlow = -1, vhigh = 0, vaddr, voff;
     unsigned i, count;
 
     count = elf_phdr_count(elf);
@@ -480,6 +482,7 @@  void elf_parse_binary(struct elf_binary *elf)
         if ( !elf_phdr_is_loadable(elf, phdr) )
             continue;
         paddr = elf_uval(elf, phdr, p_paddr);
+        vaddr = elf_uval(elf, phdr, p_vaddr);
         memsz = elf_uval(elf, phdr, p_memsz);
         elf_msg(elf, "ELF: phdr: paddr=%#" PRIx64 " memsz=%#" PRIx64 "\n",
                 paddr, memsz);
@@ -487,7 +490,37 @@  void elf_parse_binary(struct elf_binary *elf)
             low = paddr;
         if ( high < paddr + memsz )
             high = paddr + memsz;
+        if ( vlow > vaddr )
+            vlow = vaddr;
+        if ( vhigh < vaddr + memsz )
+            vhigh = vaddr + memsz;
     }
+
+    voff = vhigh - high;
+
+    count = elf_shdr_count(elf);
+    for ( i = 0; i < count; i++ )
+    {
+        shdr = elf_shdr_by_index(elf, i);
+        if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) )
+            /* input has an insane section header count field */
+            break;
+        if ( !(elf_uval(elf, shdr, sh_flags) & SHF_ALLOC) )
+            continue;
+        vaddr = elf_uval(elf, shdr, sh_addr);
+        memsz = elf_uval(elf, shdr, sh_size);
+        if ( vlow > vaddr )
+        {
+            vlow = vaddr;
+            low = vaddr - voff;
+        }
+        if ( vhigh < vaddr + memsz )
+        {
+            vhigh = vaddr + memsz;
+            high = vaddr + memsz - voff;
+        }
+    }
+
     elf->pstart = low;
     elf->pend = high;
     elf_msg(elf, "ELF: memory: %#" PRIx64 " -> %#" PRIx64 "\n",