
[2/8] x86/setup: Fix badpage= handling for memory above HYPERVISOR_VIRT_END

Message ID 20200201003303.2363081-2-dwmw2@infradead.org (mailing list archive)
State New, archived
Series Early cleanups and bug fixes in preparation for live update

Commit Message

David Woodhouse Feb. 1, 2020, 12:32 a.m. UTC
From: David Woodhouse <dwmw@amazon.co.uk>

Bad pages are identified by get_platform_badpages() and with badpage=
on the command line.

The boot allocator currently automatically elides these from the regions
passed to it with init_boot_pages(). The xenheap is then initialised
with the pages which are still marked as free by the boot allocator when
end_boot_allocator() is called.

However, any memory above HYPERVISOR_VIRT_END is passed directly to
init_domheap_pages() later in __start_xen(), and the bad page list is
not consulted.

Fix this by marking those pages as PGC_broken in the frametable at the
time end_boot_allocator() runs, and then making init_heap_pages() skip
over any pages which are so marked.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 xen/common/page_alloc.c | 82 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 3 deletions(-)

Comments

Julien Grall Feb. 3, 2020, 10:57 a.m. UTC | #1
Hi David,

On 01/02/2020 00:32, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
> 
> Bad pages are identified by get_platform_badpages() and with badpage=
> on the command line.
> 
> The boot allocator currently automatically elides these from the regions
> passed to it with init_boot_pages(). The xenheap is then initialised
> with the pages which are still marked as free by the boot allocator when
> end_boot_allocator() is called.
> 
> However, any memory above HYPERVISOR_VIRT_END is passed directly to
> init_domheap_pages() later in __start_xen(), and the bad page list is
> not consulted.
> 
> Fix this by marking those pages as PGC_broken in the frametable at the
> time end_boot_allocator() runs, and then making init_heap_pages() skip
> over any pages which are so marked.
> 
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
>   xen/common/page_alloc.c | 82 +++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 79 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 919a270587..3cf478311b 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -1758,6 +1758,18 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
>       return 0;
>   }
>   
> +static unsigned long contig_avail_pages(struct page_info *pg, unsigned long max_pages)
> +{
> +    unsigned long i;
> +
> +    for ( i = 0 ; i < max_pages; i++)
> +    {
> +        if ( pg[i].count_info & PGC_broken )
> +            break;
> +    }
> +    return i;
> +}
> +
>   /*
>    * Hand the specified arbitrary page range to the specified heap zone
>    * checking the node_id of the previous page.  If they differ and the
> @@ -1799,18 +1811,23 @@ static void init_heap_pages(
>       {
>           unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
>   
> +        /* If the (first) page is already marked broken, don't add it. */
> +        if ( pg[i].count_info & PGC_broken )
> +            continue;
> +
>           if ( unlikely(!avail[nid]) )
>           {
> +            unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages);
>               unsigned long s = mfn_x(page_to_mfn(pg + i));
> -            unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
> +            unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + i + contig_nr_pages - 1), 1));
>               bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
>                               !(s & ((1UL << MAX_ORDER) - 1)) &&
>                               (find_first_set_bit(e) <= find_first_set_bit(s));
>               unsigned long n;
>   
> -            n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i,
> +            n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), contig_nr_pages,
>                                  &use_tail);
> -            BUG_ON(i + n > nr_pages);
> +            BUG_ON(n > contig_nr_pages);
>               if ( n && !use_tail )
>               {
>                   i += n - 1;
> @@ -1846,6 +1863,63 @@ static unsigned long avail_heap_pages(
>       return free_pages;
>   }
>   
> +static void mark_bad_pages(void)
> +{
> +    unsigned long bad_spfn, bad_epfn;

I would like to avoid adding more 'pfn' in the code. Per the current 
terminology (see include/xen/mm.h), this would be bad_smfn, bad_emfn.

Ideally, this should also use mfn_t, but I can live without it for now.
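
Just as a sketch (with the '-'/',' parsing elided), the tail of the loop
would then read:

    mfn_t bad_smfn, bad_emfn;

    bad_smfn = _mfn(simple_strtoul(p, &p, 0));
    bad_emfn = bad_smfn;
    /* '-' range and ',' separator handling unchanged, just using mfn_t. */

    while ( mfn_valid(bad_smfn) && mfn_x(bad_smfn) < mfn_x(bad_emfn) )
    {
        pg = mfn_to_page(bad_smfn);
        pg->count_info |= PGC_broken;
        page_list_add_tail(pg, &page_broken_list);
        bad_smfn = mfn_add(bad_smfn, 1);
    }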

> +    const char *p;
> +    struct page_info *pg;
> +#ifdef CONFIG_X86
> +    const struct platform_bad_page *badpage;
> +    unsigned int i, j, array_size;
> +
> +    badpage = get_platform_badpages(&array_size);
> +    if ( badpage )
> +    {
> +        for ( i = 0; i < array_size; i++ )
> +        {
> +            for ( j = 0; j < 1UL << badpage->order; j++ )
> +            {
> +                if ( mfn_valid(_mfn(badpage->mfn + j)) )
> +                {
> +                    pg = mfn_to_page(_mfn(badpage->mfn + j));
> +                    pg->count_info |= PGC_broken;
> +                    page_list_add_tail(pg, &page_broken_list);
> +                }
> +            }
> +        }
> +    }

You are missing the PV shim part here.

But, AFAICT, the only difference with the implementation in 
init_boot_pages() is how we treat the page.

Could we introduce a common helper taking a callback? The callback would
be applied on every range.
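
Something along these lines, perhaps (the name and callback signature are
just illustrative, and this leaves the shim page aside):

    typedef void (*badpage_fn_t)(unsigned long mfn, unsigned long nr_mfns);

    static void __init for_each_platform_badpage(badpage_fn_t fn)
    {
        const struct platform_bad_page *badpage;
        unsigned int i, array_size;

        badpage = get_platform_badpages(&array_size);
        if ( !badpage )
            return;

        for ( i = 0; i < array_size; i++ )
            fn(badpage[i].mfn, 1UL << badpage[i].order);
    }

init_boot_pages() could then pass a callback clipping the range out of the
region being added, while mark_bad_pages() would pass one marking each page
PGC_broken.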

> +#endif
> +
> +    /* Check new pages against the bad-page list. */
> +    p = opt_badpage;
> +    while ( *p != '\0' )
> +    {
> +        bad_spfn = simple_strtoul(p, &p, 0);
> +        bad_epfn = bad_spfn;
> +
> +        if ( *p == '-' )
> +        {
> +            p++;
> +            bad_epfn = simple_strtoul(p, &p, 0);
> +            if ( bad_epfn < bad_spfn )
> +                bad_epfn = bad_spfn;
> +        }
> +
> +        if ( *p == ',' )
> +            p++;
> +        else if ( *p != '\0' )
> +            break;
> +
> +        while ( mfn_valid(_mfn(bad_spfn)) && bad_spfn < bad_epfn )
> +        {
> +            pg = mfn_to_page(_mfn(bad_spfn));
> +            pg->count_info |= PGC_broken;
> +            page_list_add_tail(pg, &page_broken_list);
> +            bad_spfn++;
> +        }
> +    }
> +}
> +
>   void __init end_boot_allocator(void)
>   {
>       unsigned int i;
> @@ -1870,6 +1944,8 @@ void __init end_boot_allocator(void)
>       }
>       nr_bootmem_regions = 0;
>   
> +    mark_bad_pages();
> +
>       if ( !dma_bitsize && (num_online_nodes() > 1) )
>           dma_bitsize = arch_get_dma_bitsize();
>   
> 

Cheers,
Jan Beulich Feb. 20, 2020, 3:38 p.m. UTC | #2
On 01.02.2020 01:32, David Woodhouse wrote:
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -1758,6 +1758,18 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
>      return 0;
>  }
>  
> +static unsigned long contig_avail_pages(struct page_info *pg, unsigned long max_pages)
> +{
> +    unsigned long i;
> +
> +    for ( i = 0 ; i < max_pages; i++)

Nit: Stray blank before first semicolon.

> +    {
> +        if ( pg[i].count_info & PGC_broken )
> +            break;
> +    }
> +    return i;

Further nits: Commonly we omit the braces in cases like this one.
We also like to have blank lines before the main return statement
of a function.
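
I.e. (sketch only):

    static unsigned long contig_avail_pages(struct page_info *pg,
                                            unsigned long max_pages)
    {
        unsigned long i;

        for ( i = 0; i < max_pages; i++ )
            if ( pg[i].count_info & PGC_broken )
                break;

        return i;
    }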

> @@ -1846,6 +1863,63 @@ static unsigned long avail_heap_pages(
>      return free_pages;
>  }
>  
> +static void mark_bad_pages(void)

__init please

> +{
> +    unsigned long bad_spfn, bad_epfn;
> +    const char *p;
> +    struct page_info *pg;
> +#ifdef CONFIG_X86
> +    const struct platform_bad_page *badpage;
> +    unsigned int i, j, array_size;
> +
> +    badpage = get_platform_badpages(&array_size);
> +    if ( badpage )
> +    {
> +        for ( i = 0; i < array_size; i++ )
> +        {
> +            for ( j = 0; j < 1UL << badpage->order; j++ )

Either you mean badpage[i].* here and below, or you're missing an
increment of badpage somewhere.
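
I.e. presumably (untested):

    for ( i = 0; i < array_size; i++ )
    {
        for ( j = 0; j < 1UL << badpage[i].order; j++ )
        {
            if ( mfn_valid(_mfn(badpage[i].mfn + j)) )
            {
                pg = mfn_to_page(_mfn(badpage[i].mfn + j));
                pg->count_info |= PGC_broken;
                page_list_add_tail(pg, &page_broken_list);
            }
        }
    }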

> +            {
> +                if ( mfn_valid(_mfn(badpage->mfn + j)) )
> +                {
> +                    pg = mfn_to_page(_mfn(badpage->mfn + j));
> +                    pg->count_info |= PGC_broken;
> +                    page_list_add_tail(pg, &page_broken_list);
> +                }
> +            }
> +        }
> +    }
> +#endif
> +
> +    /* Check new pages against the bad-page list. */
> +    p = opt_badpage;
> +    while ( *p != '\0' )
> +    {
> +        bad_spfn = simple_strtoul(p, &p, 0);
> +        bad_epfn = bad_spfn;
> +
> +        if ( *p == '-' )
> +        {
> +            p++;
> +            bad_epfn = simple_strtoul(p, &p, 0);
> +            if ( bad_epfn < bad_spfn )
> +                bad_epfn = bad_spfn;
> +        }
> +
> +        if ( *p == ',' )
> +            p++;
> +        else if ( *p != '\0' )
> +            break;

I think this parsing, which is common with init_boot_pages(), would better
be abstracted out, such that there is just one instance of it and hence no
risk of the two going out of sync.
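
For example (untested, and the name is just made up), both callers could
share something like:

    /*
     * Parse one "<start>[-<end>]" entry from the badpage= string,
     * advancing *p past a trailing ','.  Returns false once the string
     * is exhausted or malformed.
     */
    static bool __init parse_badpage_range(const char **p,
                                           unsigned long *s, unsigned long *e)
    {
        if ( **p == '\0' )
            return false;

        *s = simple_strtoul(*p, p, 0);
        *e = *s;

        if ( **p == '-' )
        {
            ++*p;
            *e = simple_strtoul(*p, p, 0);
            if ( *e < *s )
                *e = *s;
        }

        if ( **p == ',' )
            ++*p;
        else if ( **p != '\0' )
            return false;

        return true;
    }

with both init_boot_pages() and mark_bad_pages() reduced to

    while ( parse_badpage_range(&p, &bad_spfn, &bad_epfn) )
        /* clip or mark [bad_spfn, bad_epfn] */;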

> +        while ( mfn_valid(_mfn(bad_spfn)) && bad_spfn < bad_epfn )

As per init_boot_pages() as well as per the "bad_epfn = bad_spfn;"
you have further up, the range here is inclusive at its end. I'm
also uncertain about the stopping at the first !mfn_valid() -
there may well be further valid pages later on.
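
I.e. something more along the lines of (untested):

    for ( ; bad_spfn <= bad_epfn; bad_spfn++ )
    {
        if ( !mfn_valid(_mfn(bad_spfn)) )
            continue;

        pg = mfn_to_page(_mfn(bad_spfn));
        pg->count_info |= PGC_broken;
        page_list_add_tail(pg, &page_broken_list);
    }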

Jan
Julien Grall March 6, 2020, 10:52 p.m. UTC | #3
Hi David,

On 01/02/2020 00:32, David Woodhouse wrote:
>   /*
>    * Hand the specified arbitrary page range to the specified heap zone
>    * checking the node_id of the previous page.  If they differ and the
> @@ -1799,18 +1811,23 @@ static void init_heap_pages(
>       {
>           unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
>   
> +        /* If the (first) page is already marked broken, don't add it. */
> +        if ( pg[i].count_info & PGC_broken )
> +            continue;
> +
>           if ( unlikely(!avail[nid]) )
>           {
> +            unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages);

I spent the evening chasing a bug due to this change. Xen may give
init_heap_pages() a range that crosses a NUMA node boundary.

contig_avail_pages() will walk the range [pg + i, pg + i + nr_pages). As
nr_pages corresponds to the number of pages in the original range, we may
overrun the frametable and crash.

So I think you want to use contig_avail_pages(pg + i, nr_pages - i).
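
I.e.:

-            unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages);
+            unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages - i);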

Cheers,

Patch

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 919a270587..3cf478311b 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1758,6 +1758,18 @@  int query_page_offline(mfn_t mfn, uint32_t *status)
     return 0;
 }
 
+static unsigned long contig_avail_pages(struct page_info *pg, unsigned long max_pages)
+{
+    unsigned long i;
+
+    for ( i = 0 ; i < max_pages; i++)
+    {
+        if ( pg[i].count_info & PGC_broken )
+            break;
+    }
+    return i;
+}
+
 /*
  * Hand the specified arbitrary page range to the specified heap zone
  * checking the node_id of the previous page.  If they differ and the
@@ -1799,18 +1811,23 @@  static void init_heap_pages(
     {
         unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
 
+        /* If the (first) page is already marked broken, don't add it. */
+        if ( pg[i].count_info & PGC_broken )
+            continue;
+
         if ( unlikely(!avail[nid]) )
         {
+            unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages);
             unsigned long s = mfn_x(page_to_mfn(pg + i));
-            unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
+            unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + i + contig_nr_pages - 1), 1));
             bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
                             !(s & ((1UL << MAX_ORDER) - 1)) &&
                             (find_first_set_bit(e) <= find_first_set_bit(s));
             unsigned long n;
 
-            n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i,
+            n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), contig_nr_pages,
                                &use_tail);
-            BUG_ON(i + n > nr_pages);
+            BUG_ON(n > contig_nr_pages);
             if ( n && !use_tail )
             {
                 i += n - 1;
@@ -1846,6 +1863,63 @@  static unsigned long avail_heap_pages(
     return free_pages;
 }
 
+static void mark_bad_pages(void)
+{
+    unsigned long bad_spfn, bad_epfn;
+    const char *p;
+    struct page_info *pg;
+#ifdef CONFIG_X86
+    const struct platform_bad_page *badpage;
+    unsigned int i, j, array_size;
+
+    badpage = get_platform_badpages(&array_size);
+    if ( badpage )
+    {
+        for ( i = 0; i < array_size; i++ )
+        {
+            for ( j = 0; j < 1UL << badpage->order; j++ )
+            {
+                if ( mfn_valid(_mfn(badpage->mfn + j)) )
+                {
+                    pg = mfn_to_page(_mfn(badpage->mfn + j));
+                    pg->count_info |= PGC_broken;
+                    page_list_add_tail(pg, &page_broken_list);
+                }
+            }
+        }
+    }
+#endif
+
+    /* Check new pages against the bad-page list. */
+    p = opt_badpage;
+    while ( *p != '\0' )
+    {
+        bad_spfn = simple_strtoul(p, &p, 0);
+        bad_epfn = bad_spfn;
+
+        if ( *p == '-' )
+        {
+            p++;
+            bad_epfn = simple_strtoul(p, &p, 0);
+            if ( bad_epfn < bad_spfn )
+                bad_epfn = bad_spfn;
+        }
+
+        if ( *p == ',' )
+            p++;
+        else if ( *p != '\0' )
+            break;
+
+        while ( mfn_valid(_mfn(bad_spfn)) && bad_spfn < bad_epfn )
+        {
+            pg = mfn_to_page(_mfn(bad_spfn));
+            pg->count_info |= PGC_broken;
+            page_list_add_tail(pg, &page_broken_list);
+            bad_spfn++;
+        }
+    }
+}
+
 void __init end_boot_allocator(void)
 {
     unsigned int i;
@@ -1870,6 +1944,8 @@  void __init end_boot_allocator(void)
     }
     nr_bootmem_regions = 0;
 
+    mark_bad_pages();
+
     if ( !dma_bitsize && (num_online_nodes() > 1) )
         dma_bitsize = arch_get_dma_bitsize();