diff mbox series

[v4] EFI: free unused boot mem in at least some cases

Message ID d8b1bcc8-ffcc-f7fe-b4ad-ce7dcdaed491@suse.com (mailing list archive)
State New, archived
Headers show
Series [v4] EFI: free unused boot mem in at least some cases | expand

Commit Message

Jan Beulich Sept. 16, 2020, 12:20 p.m. UTC
Address at least the primary reason why 52bba67f8b87 ("efi/boot: Don't
free ebmalloc area at all") was put in place: Make xen_in_range() aware
of the freed range. This is in particular relevant for EFI-enabled
builds not actually running on EFI, as the entire range will be unused
in this case.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Address PV shim breakage (stub function also needed adjustment).
v3: Don't free the memory twice.
v2: Also adjust the two places where comments point out that they need
    to remain in sync with xen_in_range(). Add assertions to
    xen_in_range().
---
The remaining issue could be addressed too, by making the area 2M in
size and 2M-aligned.

Comments

Roger Pau Monne Sept. 17, 2020, 10:45 a.m. UTC | #1
On Wed, Sep 16, 2020 at 02:20:54PM +0200, Jan Beulich wrote:
> Address at least the primary reason why 52bba67f8b87 ("efi/boot: Don't
> free ebmalloc area at all") was put in place: Make xen_in_range() aware
> of the freed range. This is in particular relevant for EFI-enabled
> builds not actually running on EFI, as the entire range will be unused
> in this case.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v4: Address PV shim breakage (stub function also needed adjustment).
> v3: Don't free the memory twice.
> v2: Also adjust the two places where comments point out that they need
>     to remain in sync with xen_in_range(). Add assertions to
>     xen_in_range().
> ---
> The remaining issue could be addressed too, by making the area 2M in
> size and 2M-aligned.
> 
> --- a/xen/arch/x86/efi/stub.c
> +++ b/xen/arch/x86/efi/stub.c
> @@ -52,6 +52,13 @@ bool efi_enabled(unsigned int feature)
>  
>  void __init efi_init_memory(void) { }
>  
> +bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
> +{
> +    if ( start || end )

Shouldn't this be start && end?

Or else you might be de-referencing a NULL pointer?

> +        *start = *end = (unsigned long)_end;
> +    return false;
> +}
> +
>  void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t l4e) { }
>  
>  bool efi_rs_using_pgtables(void)
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -830,6 +830,7 @@ void __init noreturn __start_xen(unsigne
>      module_t *mod;
>      unsigned long nr_pages, raw_max_page, modules_headroom, module_map[1];
>      int i, j, e820_warn = 0, bytes = 0;
> +    unsigned long eb_start, eb_end;
>      bool acpi_boot_table_init_done = false, relocated = false;
>      int ret;
>      struct ns16550_defaults ns16550 = {
> @@ -1145,7 +1146,8 @@ void __init noreturn __start_xen(unsigne
>  
>          /*
>           * This needs to remain in sync with xen_in_range() and the
> -         * respective reserve_e820_ram() invocation below.
> +         * respective reserve_e820_ram() invocation below. No need to
> +         * query efi_boot_mem_unused() here, though.
>           */
>          mod[mbi->mods_count].mod_start = virt_to_mfn(_stext);
>          mod[mbi->mods_count].mod_end = __2M_rwdata_end - _stext;
> @@ -1417,8 +1419,18 @@ void __init noreturn __start_xen(unsigne
>      if ( !xen_phys_start )
>          panic("Not enough memory to relocate Xen\n");
>  
> +    /* FIXME: Putting a hole in .bss would shatter the large page mapping. */
> +    if ( using_2M_mapping() )
> +        efi_boot_mem_unused(NULL, NULL);

This seems really weird IMO...

> +
>      /* This needs to remain in sync with xen_in_range(). */
> -    reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));
> +    if ( efi_boot_mem_unused(&eb_start, &eb_end) )
> +    {
> +        reserve_e820_ram(&boot_e820, __pa(_stext), __pa(eb_start));
> +        reserve_e820_ram(&boot_e820, __pa(eb_end), __pa(__2M_rwdata_end));
> +    }
> +    else
> +        reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));
>  
>      /* Late kexec reservation (dynamic start address). */
>      kexec_reserve_area(&boot_e820);
> @@ -1979,7 +1991,7 @@ int __hwdom_init xen_in_range(unsigned l
>      paddr_t start, end;
>      int i;
>  
> -    enum { region_s3, region_ro, region_rw, nr_regions };
> +    enum { region_s3, region_ro, region_rw, region_bss, nr_regions };
>      static struct {
>          paddr_t s, e;
>      } xen_regions[nr_regions] __hwdom_initdata;
> @@ -2004,6 +2016,14 @@ int __hwdom_init xen_in_range(unsigned l
>          /* hypervisor .data + .bss */
>          xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
>          xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
> +        if ( efi_boot_mem_unused(&start, &end) )
> +        {
> +            ASSERT(__pa(start) >= xen_regions[region_rw].s);
> +            ASSERT(__pa(end) <= xen_regions[region_rw].e);
> +            xen_regions[region_rw].e = __pa(start);
> +            xen_regions[region_bss].s = __pa(end);
> +            xen_regions[region_bss].e = __pa(&__2M_rwdata_end);
> +        }
>      }
>  
>      start = (paddr_t)mfn << PAGE_SHIFT;
> --- a/xen/arch/x86/tboot.c
> +++ b/xen/arch/x86/tboot.c
> @@ -1,3 +1,4 @@
> +#include <xen/efi.h>
>  #include <xen/init.h>
>  #include <xen/types.h>
>  #include <xen/lib.h>
> @@ -364,6 +365,8 @@ void tboot_shutdown(uint32_t shutdown_ty
>      /* if this is S3 then set regions to MAC */
>      if ( shutdown_type == TB_SHUTDOWN_S3 )
>      {
> +        unsigned long s, e;
> +
>          /*
>           * Xen regions for tboot to MAC. This needs to remain in sync with
>           * xen_in_range().
> @@ -378,6 +381,15 @@ void tboot_shutdown(uint32_t shutdown_ty
>          /* hypervisor .data + .bss */
>          g_tboot_shared->mac_regions[2].start = (uint64_t)__pa(&__2M_rwdata_start);
>          g_tboot_shared->mac_regions[2].size = __2M_rwdata_end - __2M_rwdata_start;
> +        if ( efi_boot_mem_unused(&s, &e) )
> +        {
> +            g_tboot_shared->mac_regions[2].size =
> +                s - (unsigned long)__2M_rwdata_start;
> +            g_tboot_shared->mac_regions[3].start = __pa(e);
> +            g_tboot_shared->mac_regions[3].size =
> +                (unsigned long)__2M_rwdata_end - e;
> +            g_tboot_shared->num_mac_regions = 4;
> +        }
>  
>          /*
>           * MAC domains and other Xen memory
> --- a/xen/common/efi/ebmalloc.c
> +++ b/xen/common/efi/ebmalloc.c
> @@ -1,5 +1,6 @@
>  #include "efi.h"
>  #include <xen/init.h>
> +#include <xen/mm.h>
>  
>  #ifdef CONFIG_ARM
>  /*
> @@ -21,7 +22,7 @@
>  
>  static char __section(".bss.page_aligned") __aligned(PAGE_SIZE)
>      ebmalloc_mem[EBMALLOC_SIZE];
> -static unsigned long __initdata ebmalloc_allocated;
> +static unsigned long __read_mostly ebmalloc_allocated;
>  
>  /* EFI boot allocator. */
>  void __init *ebmalloc(size_t size)
> @@ -36,17 +37,37 @@ void __init *ebmalloc(size_t size)
>      return ptr;
>  }
>  
> +bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
> +{
> +    if ( !start && !end )
> +    {
> +        ebmalloc_allocated = sizeof(ebmalloc_mem);
> +        return false;
> +    }

... I would instead place the using_2M_mapping check here and return
start = end in that case.

Thanks, Roger.
Jan Beulich Sept. 17, 2020, 10:56 a.m. UTC | #2
On 17.09.2020 12:45, Roger Pau Monné wrote:
> On Wed, Sep 16, 2020 at 02:20:54PM +0200, Jan Beulich wrote:
>> --- a/xen/arch/x86/efi/stub.c
>> +++ b/xen/arch/x86/efi/stub.c
>> @@ -52,6 +52,13 @@ bool efi_enabled(unsigned int feature)
>>  
>>  void __init efi_init_memory(void) { }
>>  
>> +bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
>> +{
>> +    if ( start || end )
> 
> Shouldn't this be start && end?

This is consistent with "if ( !start && !end )" in the non-stub
function, which was there in v3 already.

> Or else you might be de-referencing a NULL pointer?

Intentionally so: I'd view it as worse if we didn't fill *start
or *end if just one gets passed as NULL. The way it's done now
it'll be a reliable crash, as the v3 issue with the shim has
shown (where the if() here was missing).

>> @@ -1417,8 +1419,18 @@ void __init noreturn __start_xen(unsigne
>>      if ( !xen_phys_start )
>>          panic("Not enough memory to relocate Xen\n");
>>  
>> +    /* FIXME: Putting a hole in .bss would shatter the large page mapping. */
>> +    if ( using_2M_mapping() )
>> +        efi_boot_mem_unused(NULL, NULL);
> 
> This seems really weird IMO...

What I didn't like about earlier versions was the exposure of
using_2M_mapping() outside of this CU. The way it works is
somewhat fragile, and hence I think limiting its exposure is a
win. This way there's also no x86-specific code in ebmalloc.c
anymore.

I'm also slightly puzzled that you ask now when you had acked
this same construct in v3. It's really just the stub function
which has changed in v4.

>> @@ -36,17 +37,37 @@ void __init *ebmalloc(size_t size)
>>      return ptr;
>>  }
>>  
>> +bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
>> +{
>> +    if ( !start && !end )
>> +    {
>> +        ebmalloc_allocated = sizeof(ebmalloc_mem);
>> +        return false;
>> +    }
> 
> ... I would instead place the using_2M_mapping check here

As per above, this would mean x86-specific code here again.

> and return start = end in that case.

I don't think I understand this part, possibly starting with me
wondering whether you mean *start == *end (and implying they'd
be set to valid values first).

Jan
Roger Pau Monne Sept. 17, 2020, 11:17 a.m. UTC | #3
On Thu, Sep 17, 2020 at 12:56:41PM +0200, Jan Beulich wrote:
> On 17.09.2020 12:45, Roger Pau Monné wrote:
> > On Wed, Sep 16, 2020 at 02:20:54PM +0200, Jan Beulich wrote:
> >> --- a/xen/arch/x86/efi/stub.c
> >> +++ b/xen/arch/x86/efi/stub.c
> >> @@ -52,6 +52,13 @@ bool efi_enabled(unsigned int feature)
> >>  
> >>  void __init efi_init_memory(void) { }
> >>  
> >> +bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
> >> +{
> >> +    if ( start || end )
> > 
> > Shouldn't this be start && end?
> 
> This is consistent with "if ( !start && !end )" in the non-stub
> function, which was there in v3 already.

Right, I certainly didn't catch that passing one as NULL would cause a
deref there also.

I would be more comfortable with adding an ASSERT, but I'm not going
to insist. IIRC there was a time when Xen running as a PVH guest (like
in shim mode) would cause it to have a valid mapping at 0.

> > Or else you might be de-referencing a NULL pointer?
> 
> Intentionally so: I'd view it as worse if we didn't fill *start
> or *end if just one gets passed as NULL. The way it's done now
> it'll be a reliable crash, as the v3 issue with the shim has
> shown (where the if() here was missing).
> 
> >> @@ -1417,8 +1419,18 @@ void __init noreturn __start_xen(unsigne
> >>      if ( !xen_phys_start )
> >>          panic("Not enough memory to relocate Xen\n");
> >>  
> >> +    /* FIXME: Putting a hole in .bss would shatter the large page mapping. */
> >> +    if ( using_2M_mapping() )
> >> +        efi_boot_mem_unused(NULL, NULL);
> > 
> > This seems really weird IMO...
> 
> What I didn't like about earlier versions was the exposure of
> using_2M_mapping() outside of this CU. The way it works is
> somewhat fragile, and hence I think limiting its exposure is a
> win. This way there's also no x86-specific code in ebmalloc.c
> anymore.
> 
> I'm also slightly puzzled that you ask now when you had acked
> this same construct in v3. It's really just the stub function
> which has changed in v4.

Would you mind also adding a FIXME comment in efi_boot_mem_unused that
setting ebmalloc_allocated to sizeof(ebmalloc_mem) will be removed
once we can properly free the region regardless of whether 2M are
being used?

Seems like an abuse of that the function should be doing by passing
NULL pointers to it, or maybe I'm just being dense.

With that:

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.
Jan Beulich Sept. 17, 2020, 11:39 a.m. UTC | #4
On 17.09.2020 13:17, Roger Pau Monné wrote:
> On Thu, Sep 17, 2020 at 12:56:41PM +0200, Jan Beulich wrote:
>> On 17.09.2020 12:45, Roger Pau Monné wrote:
>>> On Wed, Sep 16, 2020 at 02:20:54PM +0200, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/efi/stub.c
>>>> +++ b/xen/arch/x86/efi/stub.c
>>>> @@ -52,6 +52,13 @@ bool efi_enabled(unsigned int feature)
>>>>  
>>>>  void __init efi_init_memory(void) { }
>>>>  
>>>> +bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
>>>> +{
>>>> +    if ( start || end )
>>>
>>> Shouldn't this be start && end?
>>
>> This is consistent with "if ( !start && !end )" in the non-stub
>> function, which was there in v3 already.
> 
> Right, I certainly didn't catch that passing one as NULL would cause a
> deref there also.
> 
> I would be more comfortable with adding an ASSERT, but I'm not going
> to insist. IIRC there was a time when Xen running as a PVH guest (like
> in shim mode) would cause it to have a valid mapping at 0.

Well, apparently not anymore, or else v3 wouldn't have needed prompt
reverting. With ...

>>>> @@ -1417,8 +1419,18 @@ void __init noreturn __start_xen(unsigne
>>>>      if ( !xen_phys_start )
>>>>          panic("Not enough memory to relocate Xen\n");
>>>>  
>>>> +    /* FIXME: Putting a hole in .bss would shatter the large page mapping. */
>>>> +    if ( using_2M_mapping() )
>>>> +        efi_boot_mem_unused(NULL, NULL);
>>>
>>> This seems really weird IMO...
>>
>> What I didn't like about earlier versions was the exposure of
>> using_2M_mapping() outside of this CU. The way it works is
>> somewhat fragile, and hence I think limiting its exposure is a
>> win. This way there's also no x86-specific code in ebmalloc.c
>> anymore.
>>
>> I'm also slightly puzzled that you ask now when you had acked
>> this same construct in v3. It's really just the stub function
>> which has changed in v4.
> 
> Would you mind also adding a FIXME comment in efi_boot_mem_unused that
> setting ebmalloc_allocated to sizeof(ebmalloc_mem) will be removed
> once we can properly free the region regardless of whether 2M are
> being used?

... the two FIXMEs added, it is sufficiently clear that the goal
is for this to be transient only anyway. As said - I have a plan,
I just need to find the time to see if it works out.

> Seems like an abuse of that the function should be doing by passing
> NULL pointers to it, or maybe I'm just being dense.

In a way it is an abuse, I agree, with - as said - the goal of
avoiding to expose using_2M_mapping().

> With that:
> 
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks much.

Jan
diff mbox series

Patch

--- a/xen/arch/x86/efi/stub.c
+++ b/xen/arch/x86/efi/stub.c
@@ -52,6 +52,13 @@  bool efi_enabled(unsigned int feature)
 
 void __init efi_init_memory(void) { }
 
+bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
+{
+    if ( start || end )
+        *start = *end = (unsigned long)_end;
+    return false;
+}
+
 void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t l4e) { }
 
 bool efi_rs_using_pgtables(void)
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -830,6 +830,7 @@  void __init noreturn __start_xen(unsigne
     module_t *mod;
     unsigned long nr_pages, raw_max_page, modules_headroom, module_map[1];
     int i, j, e820_warn = 0, bytes = 0;
+    unsigned long eb_start, eb_end;
     bool acpi_boot_table_init_done = false, relocated = false;
     int ret;
     struct ns16550_defaults ns16550 = {
@@ -1145,7 +1146,8 @@  void __init noreturn __start_xen(unsigne
 
         /*
          * This needs to remain in sync with xen_in_range() and the
-         * respective reserve_e820_ram() invocation below.
+         * respective reserve_e820_ram() invocation below. No need to
+         * query efi_boot_mem_unused() here, though.
          */
         mod[mbi->mods_count].mod_start = virt_to_mfn(_stext);
         mod[mbi->mods_count].mod_end = __2M_rwdata_end - _stext;
@@ -1417,8 +1419,18 @@  void __init noreturn __start_xen(unsigne
     if ( !xen_phys_start )
         panic("Not enough memory to relocate Xen\n");
 
+    /* FIXME: Putting a hole in .bss would shatter the large page mapping. */
+    if ( using_2M_mapping() )
+        efi_boot_mem_unused(NULL, NULL);
+
     /* This needs to remain in sync with xen_in_range(). */
-    reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));
+    if ( efi_boot_mem_unused(&eb_start, &eb_end) )
+    {
+        reserve_e820_ram(&boot_e820, __pa(_stext), __pa(eb_start));
+        reserve_e820_ram(&boot_e820, __pa(eb_end), __pa(__2M_rwdata_end));
+    }
+    else
+        reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));
 
     /* Late kexec reservation (dynamic start address). */
     kexec_reserve_area(&boot_e820);
@@ -1979,7 +1991,7 @@  int __hwdom_init xen_in_range(unsigned l
     paddr_t start, end;
     int i;
 
-    enum { region_s3, region_ro, region_rw, nr_regions };
+    enum { region_s3, region_ro, region_rw, region_bss, nr_regions };
     static struct {
         paddr_t s, e;
     } xen_regions[nr_regions] __hwdom_initdata;
@@ -2004,6 +2016,14 @@  int __hwdom_init xen_in_range(unsigned l
         /* hypervisor .data + .bss */
         xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
         xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
+        if ( efi_boot_mem_unused(&start, &end) )
+        {
+            ASSERT(__pa(start) >= xen_regions[region_rw].s);
+            ASSERT(__pa(end) <= xen_regions[region_rw].e);
+            xen_regions[region_rw].e = __pa(start);
+            xen_regions[region_bss].s = __pa(end);
+            xen_regions[region_bss].e = __pa(&__2M_rwdata_end);
+        }
     }
 
     start = (paddr_t)mfn << PAGE_SHIFT;
--- a/xen/arch/x86/tboot.c
+++ b/xen/arch/x86/tboot.c
@@ -1,3 +1,4 @@ 
+#include <xen/efi.h>
 #include <xen/init.h>
 #include <xen/types.h>
 #include <xen/lib.h>
@@ -364,6 +365,8 @@  void tboot_shutdown(uint32_t shutdown_ty
     /* if this is S3 then set regions to MAC */
     if ( shutdown_type == TB_SHUTDOWN_S3 )
     {
+        unsigned long s, e;
+
         /*
          * Xen regions for tboot to MAC. This needs to remain in sync with
          * xen_in_range().
@@ -378,6 +381,15 @@  void tboot_shutdown(uint32_t shutdown_ty
         /* hypervisor .data + .bss */
         g_tboot_shared->mac_regions[2].start = (uint64_t)__pa(&__2M_rwdata_start);
         g_tboot_shared->mac_regions[2].size = __2M_rwdata_end - __2M_rwdata_start;
+        if ( efi_boot_mem_unused(&s, &e) )
+        {
+            g_tboot_shared->mac_regions[2].size =
+                s - (unsigned long)__2M_rwdata_start;
+            g_tboot_shared->mac_regions[3].start = __pa(e);
+            g_tboot_shared->mac_regions[3].size =
+                (unsigned long)__2M_rwdata_end - e;
+            g_tboot_shared->num_mac_regions = 4;
+        }
 
         /*
          * MAC domains and other Xen memory
--- a/xen/common/efi/ebmalloc.c
+++ b/xen/common/efi/ebmalloc.c
@@ -1,5 +1,6 @@ 
 #include "efi.h"
 #include <xen/init.h>
+#include <xen/mm.h>
 
 #ifdef CONFIG_ARM
 /*
@@ -21,7 +22,7 @@ 
 
 static char __section(".bss.page_aligned") __aligned(PAGE_SIZE)
     ebmalloc_mem[EBMALLOC_SIZE];
-static unsigned long __initdata ebmalloc_allocated;
+static unsigned long __read_mostly ebmalloc_allocated;
 
 /* EFI boot allocator. */
 void __init *ebmalloc(size_t size)
@@ -36,17 +37,37 @@  void __init *ebmalloc(size_t size)
     return ptr;
 }
 
+bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
+{
+    if ( !start && !end )
+    {
+        ebmalloc_allocated = sizeof(ebmalloc_mem);
+        return false;
+    }
+
+    *start = (unsigned long)ebmalloc_mem + PAGE_ALIGN(ebmalloc_allocated);
+    *end = (unsigned long)ebmalloc_mem + sizeof(ebmalloc_mem);
+
+    return *start < *end;
+}
+
 void __init free_ebmalloc_unused_mem(void)
 {
-#if 0 /* FIXME: Putting a hole in the BSS breaks the IOMMU mappings for dom0. */
     unsigned long start, end;
 
-    start = (unsigned long)ebmalloc_mem + PAGE_ALIGN(ebmalloc_allocated);
-    end = (unsigned long)ebmalloc_mem + sizeof(ebmalloc_mem);
+    if ( !efi_boot_mem_unused(&start, &end) )
+        return;
 
     destroy_xen_mappings(start, end);
+
+#ifdef CONFIG_X86
+    /*
+     * By reserving the space early in the E820 map, it gets freed way before
+     * we make it here. Don't free the range a 2nd time.
+     */
+#else
     init_xenheap_pages(__pa(start), __pa(end));
+#endif
 
     printk(XENLOG_INFO "Freed %lukB unused BSS memory\n", (end - start) >> 10);
-#endif
 }
--- a/xen/include/xen/efi.h
+++ b/xen/include/xen/efi.h
@@ -33,6 +33,7 @@  struct compat_pf_efi_runtime_call;
 
 bool efi_enabled(unsigned int feature);
 void efi_init_memory(void);
+bool efi_boot_mem_unused(unsigned long *start, unsigned long *end);
 bool efi_rs_using_pgtables(void);
 unsigned long efi_get_time(void);
 void efi_halt_system(void);