[RFC,2/3] x86/boot: Reserve live update boot memory

Message ID 20200108172500.1419665-2-dwmw2@infradead.org
State New, archived
Series Live update boot memory management

Commit Message

David Woodhouse Jan. 8, 2020, 5:24 p.m. UTC
From: David Woodhouse <dwmw@amazon.co.uk>

For live update to work, it will need a region of memory that can be
given to the boot allocator while it parses the state information from
the previous Xen and works out which of the other pages of memory it
can consume.

Reserve that like the crashdump region, and accept it on the command
line. Use only that region for early boot, and register the remaining
RAM (all of it for now, until the real live update happens) later.
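
The option syntax, as accepted by the parse_liveupdate() handler added
here, is

    liveupdate=<size>[@<start>[:<data>]]

e.g. "liveupdate=64M@2G" to use a fixed region, or just "liveupdate=64M"
to let Xen pick a suitable region while relocating itself.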

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 xen/arch/x86/setup.c | 114 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 107 insertions(+), 7 deletions(-)

Comments

Jan Beulich Jan. 20, 2020, 4:58 p.m. UTC | #1
On 08.01.2020 18:24, David Woodhouse wrote:
> @@ -980,6 +1015,22 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>      set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
>      kexec_reserve_area(&boot_e820);
>  
> +    if ( lu_bootmem_start )
> +    {
> +        /* XX: Check it's in usable memory first */
> +        reserve_e820_ram(&boot_e820, lu_bootmem_start, lu_bootmem_start + lu_bootmem_size);
> +
> +        /* It will already be out of the e820 map by the time of the first
> +         * loop over physical memory, so map it manually here. */
> +        set_pdx_range(lu_bootmem_start >> PAGE_SHIFT,
> +                      (lu_bootmem_start + lu_bootmem_size) >> PAGE_SHIFT);
> +        map_pages_to_xen((unsigned long)__va(lu_bootmem_start),
> +                         maddr_to_mfn(lu_bootmem_start),
> +                         PFN_DOWN(lu_bootmem_size), PAGE_HYPERVISOR);

Doesn't this require the range to be a multiple of 2Mb and below
4Gb? I don't see this enforced anywhere.

> @@ -1278,8 +1348,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>          xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
>  
>      /*
> -     * Walk every RAM region and map it in its entirety (on x86/64, at least)
> -     * and notify it to the boot allocator.
> +     * Walk every RAM region and map it in its entirety and (unless in
> +     * live update mode) notify it to the boot allocator.
>       */
>      for ( i = 0; i < boot_e820.nr_map; i++ )
>      {
> @@ -1329,6 +1399,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>                  printk(XENLOG_WARNING "Ignoring inaccessible memory range"
>                                        " %013"PRIx64"-%013"PRIx64"\n",
>                         s, e);
> +                reserve_e820_ram(&boot_e820, s, e);
>                  continue;
>              }
>              map_e = e;
> @@ -1336,6 +1407,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>              printk(XENLOG_WARNING "Ignoring inaccessible memory range"
>                                    " %013"PRIx64"-%013"PRIx64"\n",
>                     e, map_e);
> +            reserve_e820_ram(&boot_e820, e, map_e);
>          }
>  
>          set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);

What are these two hunks needed for? The comment you change further up
relates to ...

> @@ -1346,7 +1418,9 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>                        ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT);
>  
>          /* Pass mapped memory to allocator /before/ creating new mappings. */
> -        init_boot_pages(s, min(map_s, e));
> +        if ( !lu_reserved )
> +            init_boot_pages(s, min(map_s, e));

... this afaict.

Apart from this, also applicable to patch 3 - where I have no other
comments - there's quite a bit of style cleanup to be done here. And
of course the new command line option wants documenting. I can't
e.g. guess yet what lu_data is about, and hence why this is
apparently an address without an accompanying size.

Jan
David Woodhouse Jan. 20, 2020, 5:24 p.m. UTC | #2
On Mon, 2020-01-20 at 17:58 +0100, Jan Beulich wrote:
> On 08.01.2020 18:24, David Woodhouse wrote:
> > @@ -980,6 +1015,22 @@ void __init noreturn __start_xen(unsigned long mbi_p)
> >      set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
> >      kexec_reserve_area(&boot_e820);
> >  
> > +    if ( lu_bootmem_start )
> > +    {
> > +        /* XX: Check it's in usable memory first */
> > +        reserve_e820_ram(&boot_e820, lu_bootmem_start, lu_bootmem_start + lu_bootmem_size);
> > +
> > +        /* It will already be out of the e820 map by the time of the first
> > +         * loop over physical memory, so map it manually here. */
> > +        set_pdx_range(lu_bootmem_start >> PAGE_SHIFT,
> > +                      (lu_bootmem_start + lu_bootmem_size) >> PAGE_SHIFT);
> > +        map_pages_to_xen((unsigned long)__va(lu_bootmem_start),
> > +                         maddr_to_mfn(lu_bootmem_start),
> > +                         PFN_DOWN(lu_bootmem_size), PAGE_HYPERVISOR);
> 
> Doesn't this require the range to be a multiple of 2Mb and below
> 4Gb? I don't see this enforced anywhere.

Aha, so *that's* why the mapping succeeded without having to allocate
any memory for PTEs. That did confuse me for a while, before I figured
my time was better spent in the short term by focusing on things I
didn't understand that *weren't* working, rather than things I didn't
understand that *were*. :)

Yes, if this is the solution we end up with (and I do think it's still
the best option I've seen), then the requirement should be clearly
documented and enforced.
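
Something along these lines would do for now (a sketch only, using the
existing MB()/GB() helpers; the exact limits may yet move, per below):

    /* Sketch: reject ranges the early direct mapping can't yet cope with. */
    if ( (lu_bootmem_start | lu_bootmem_size) & (MB(2) - 1) ||
         lu_bootmem_start + lu_bootmem_size > GB(4) )
        panic("liveupdate= area must be 2MiB-aligned and below 4GiB\n");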

Andy and Hongyan are busy messing with all the 1:1 mappings, both at
boot time and at run time, so the actual restrictions may change.

> > @@ -1278,8 +1348,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
> >          xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
> >  
> >      /*
> > -     * Walk every RAM region and map it in its entirety (on x86/64, at least)
> > -     * and notify it to the boot allocator.
> > +     * Walk every RAM region and map it in its entirety and (unless in
> > +     * live update mode) notify it to the boot allocator.
> >       */
> >      for ( i = 0; i < boot_e820.nr_map; i++ )
> >      {
> > @@ -1329,6 +1399,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
> >                  printk(XENLOG_WARNING "Ignoring inaccessible memory range"
> >                                        " %013"PRIx64"-%013"PRIx64"\n",
> >                         s, e);
> > +                reserve_e820_ram(&boot_e820, s, e);
> >                  continue;
> >              }
> >              map_e = e;
> > @@ -1336,6 +1407,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
> >              printk(XENLOG_WARNING "Ignoring inaccessible memory range"
> >                                    " %013"PRIx64"-%013"PRIx64"\n",
> >                     e, map_e);
> > +            reserve_e820_ram(&boot_e820, e, map_e);
> >          }
> >  
> >          set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);
> 
> What are these two hunks needed for? The comment you change further up
> relates to ...

When we use only the LU-reserved region for bootmem, we defer the
registration of the other regions found in E820 to a later pass, after
we've consumed the live update state (and know which pages not to
touch).

So instead of just ignoring those inaccessible regions in the first
loop as we did before, we need to *mark* them reserved in our E820 data
so that they don't get registered by that second pass.

> > @@ -1346,7 +1418,9 @@ void __init noreturn __start_xen(unsigned long mbi_p)
> >                        ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT);
> >  
> >          /* Pass mapped memory to allocator /before/ creating new mappings. */
> > -        init_boot_pages(s, min(map_s, e));
> > +        if ( !lu_reserved )
> > +            init_boot_pages(s, min(map_s, e));
> 
> ... this afaict.

Kind of, but more to the point it's applicable to where we later *do*
register those pages, around line 1600.

> Apart from this, also applicable to patch 3 - where I have no other
> comments - there's quite a bit of style cleanup to be done here. And
> of course the new command line option wants documenting. I can't
> e.g. guess yet what lu_data is about, and hence why this is
> apparently an address without an accompanying size.

Right. The lu_data is intended to be the 'breadcrumb' which leads to
the actual live update state, which is scatter/gather across any pages
*outside* the reserved bootmem region.

Although it's hard to put it on the command line, since that has to be
handled by *userspace*, while the live update state is created *during*
the kexec hypercall by Xen itself. We've settled for now on putting
that breadcrumb into the start of the reserved bootmem region itself,
removing the need for a separate lu_data argument.
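
For illustration only (the struct and field names here are hypothetical;
none of this is settled yet), the breadcrumb need be little more than:

    /* Hypothetical layout at the very start of the reserved region. */
    struct lu_breadcrumb {
        uint64_t magic;           /* identifies valid live update state */
        uint64_t first_data_mfn;  /* head of the scatter/gather chain */
        uint64_t nr_data_pages;   /* number of state pages in the chain */
    };

which the new Xen would read before handing the rest of the region to
the boot allocator.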

The series continues at
https://xenbits.xen.org/gitweb/?p=people/dwmw2/xen.git;a=shortlog;h=refs/heads/lu-master
and has reached the point where I can write "Hello World" to a live
update data stream and then frown grumpily at the next Xen telling me

(XEN) 1 pages of live update data at 23e24d000
(XEN) First live update data page at MFN 23ea34:
(XEN)  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

... because the first data page got zeroed during the transition.

I'll fix that, implement the code which actually excludes busy pages
from being registered in the heap (and fix up the fact that bad pages
above HYPERVISOR_VIRT_END are also not being dropped as they should be,
while I'm at it), and post a second set for comment.

I'm mostly after feedback on the direction (for which the comment about
how the first mapping succeeds was massively useful; thanks!) rather than
the finer details of the implementation at this point. It's just that code
is sometimes a better explanation of what I mean than prose.

Patch

diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 47e065e5fe..650d70c1fc 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -678,6 +678,41 @@  static unsigned int __init copy_bios_e820(struct e820entry *map, unsigned int li
     return n;
 }
 
+static unsigned long lu_bootmem_start, lu_bootmem_size, lu_data;
+
+static int __init parse_liveupdate(const char *str)
+{
+    const char *cur;
+    lu_bootmem_size = parse_size_and_unit(cur = str, &str);
+    if ( !lu_bootmem_size || cur == str )
+        return -EINVAL;
+
+    if ( !*str ) {
+        printk("Live update size 0x%lx\n", lu_bootmem_size);
+        return 0;
+    }
+    if ( *str != '@' )
+        return -EINVAL;
+    lu_bootmem_start = parse_size_and_unit(cur = str + 1, &str);
+    if ( !lu_bootmem_start || cur == str )
+        return -EINVAL;
+
+    printk("Live update area 0x%lx-0x%lx (0x%lx)\n", lu_bootmem_start,
+           lu_bootmem_start + lu_bootmem_size, lu_bootmem_size);
+
+    if ( !*str )
+        return 0;
+    if ( *str != ':' )
+        return -EINVAL;
+    lu_data = simple_strtoull(cur = str + 1, &str, 0);
+    if ( !lu_data || cur == str )
+        return -EINVAL;
+
+    printk("Live update data at 0x%lx\n", lu_data);
+    return 0;
+}
+custom_param("liveupdate", parse_liveupdate);
+
 void __init noreturn __start_xen(unsigned long mbi_p)
 {
     char *memmap_type = NULL;
@@ -687,7 +722,7 @@  void __init noreturn __start_xen(unsigned long mbi_p)
     module_t *mod;
     unsigned long nr_pages, raw_max_page, modules_headroom, module_map[1];
     int i, j, e820_warn = 0, bytes = 0;
-    bool acpi_boot_table_init_done = false, relocated = false;
+    bool acpi_boot_table_init_done = false, relocated = false, lu_reserved = false;
     int ret;
     struct ns16550_defaults ns16550 = {
         .data_bits = 8,
@@ -980,6 +1015,22 @@  void __init noreturn __start_xen(unsigned long mbi_p)
     set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
     kexec_reserve_area(&boot_e820);
 
+    if ( lu_bootmem_start )
+    {
+        /* XX: Check it's in usable memory first */
+        reserve_e820_ram(&boot_e820, lu_bootmem_start, lu_bootmem_start + lu_bootmem_size);
+
+        /* It will already be out of the e820 map by the time of the first
+         * loop over physical memory, so map it manually here. */
+        set_pdx_range(lu_bootmem_start >> PAGE_SHIFT,
+                      (lu_bootmem_start + lu_bootmem_size) >> PAGE_SHIFT);
+        map_pages_to_xen((unsigned long)__va(lu_bootmem_start),
+                         maddr_to_mfn(lu_bootmem_start),
+                         PFN_DOWN(lu_bootmem_size), PAGE_HYPERVISOR);
+
+        lu_reserved = true;
+    }
+
     initial_images = mod;
     nr_initial_images = mbi->mods_count;
 
@@ -1204,6 +1255,16 @@  void __init noreturn __start_xen(unsigned long mbi_p)
             printk("New Xen image base address: %#lx\n", xen_phys_start);
         }
 
+        /* Is the region suitable for the live update bootmem region? */
+        if ( lu_bootmem_size && !lu_bootmem_start && e < limit )
+        {
+            end = consider_modules(s, e, lu_bootmem_size, mod, mbi->mods_count + relocated, -1);
+            if ( end )
+            {
+                e = lu_bootmem_start = end - lu_bootmem_size;
+            }
+        }
+
         /* Is the region suitable for relocating the multiboot modules? */
         for ( j = mbi->mods_count - 1; j >= 0; j-- )
         {
@@ -1267,6 +1328,15 @@  void __init noreturn __start_xen(unsigned long mbi_p)
     if ( !xen_phys_start )
         panic("Not enough memory to relocate Xen\n");
 
+    if ( lu_bootmem_start )
+    {
+        if ( !lu_reserved )
+            reserve_e820_ram(&boot_e820, lu_bootmem_start, lu_bootmem_start + lu_bootmem_size);
+        printk("LU bootmem: 0x%lx - 0x%lx\n", lu_bootmem_start, lu_bootmem_start + lu_bootmem_size);
+        init_boot_pages(lu_bootmem_start, lu_bootmem_start + lu_bootmem_size);
+        lu_reserved = true;
+    }
+
     /* This needs to remain in sync with xen_in_range(). */
     reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));
 
@@ -1278,8 +1348,8 @@  void __init noreturn __start_xen(unsigned long mbi_p)
         xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
 
     /*
-     * Walk every RAM region and map it in its entirety (on x86/64, at least)
-     * and notify it to the boot allocator.
+     * Walk every RAM region and map it in its entirety and (unless in
+     * live update mode) notify it to the boot allocator.
      */
     for ( i = 0; i < boot_e820.nr_map; i++ )
     {
@@ -1329,6 +1399,7 @@  void __init noreturn __start_xen(unsigned long mbi_p)
                 printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                                       " %013"PRIx64"-%013"PRIx64"\n",
                        s, e);
+                reserve_e820_ram(&boot_e820, s, e);
                 continue;
             }
             map_e = e;
@@ -1336,6 +1407,7 @@  void __init noreturn __start_xen(unsigned long mbi_p)
             printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                                   " %013"PRIx64"-%013"PRIx64"\n",
                    e, map_e);
+            reserve_e820_ram(&boot_e820, e, map_e);
         }
 
         set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);
@@ -1346,7 +1418,9 @@  void __init noreturn __start_xen(unsigned long mbi_p)
                       ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT);
 
         /* Pass mapped memory to allocator /before/ creating new mappings. */
-        init_boot_pages(s, min(map_s, e));
+        if ( !lu_reserved )
+            init_boot_pages(s, min(map_s, e));
+
         s = map_s;
         if ( s < map_e )
         {
@@ -1354,7 +1428,8 @@  void __init noreturn __start_xen(unsigned long mbi_p)
 
             map_s = (s + mask) & ~mask;
             map_e &= ~mask;
-            init_boot_pages(map_s, map_e);
+            if ( !lu_reserved )
+                init_boot_pages(map_s, map_e);
         }
 
         if ( map_s > map_e )
@@ -1370,7 +1445,8 @@  void __init noreturn __start_xen(unsigned long mbi_p)
             {
                 map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
                                  PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
-                init_boot_pages(map_e, end);
+                if ( !lu_reserved )
+                    init_boot_pages(map_e, end);
                 map_e = end;
             }
         }
@@ -1385,7 +1461,8 @@  void __init noreturn __start_xen(unsigned long mbi_p)
         {
             map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
                              PFN_DOWN(map_s - s), PAGE_HYPERVISOR);
-            init_boot_pages(s, map_s);
+            if ( !lu_reserved )
+                init_boot_pages(s, map_s);
         }
     }
 
@@ -1483,6 +1560,29 @@  void __init noreturn __start_xen(unsigned long mbi_p)
 
     numa_initmem_init(0, raw_max_page);
 
+    if ( lu_bootmem_start )
+    {
+        unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
+        uint64_t mask = PAGE_SIZE - 1;
+
+        for ( i = 0; i < boot_e820.nr_map; i++ )
+        {
+            uint64_t s, e;
+
+            if ( boot_e820.map[i].type != E820_RAM )
+                continue;
+            s = (boot_e820.map[i].addr + mask) & ~mask;
+            e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
+            s = max_t(uint64_t, s, 1 << 20);
+            if ( PFN_DOWN(s) > limit )
+                continue;
+            if ( PFN_DOWN(e) > limit )
+                e = pfn_to_paddr(limit);
+
+            init_boot_pages(s, e);
+        }
+    }
+
     if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
     {
         unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);