Message ID | 20200201003303.2363081-2-dwmw2@infradead.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Early cleanups and bug fixes in preparation for live update | expand |
Hi David, On 01/02/2020 00:32, David Woodhouse wrote: > From: David Woodhouse <dwmw@amazon.co.uk> > > Bad pages are identified by get_platform_badpages() and with badpage= > on the command line. > > The boot allocator currently automatically elides these from the regions > passed to it with init_boot_pages(). The xenheap is then initialised > with the pages which are still marked as free by the boot allocator when > end_boot_allocator() is called. > > However, any memory above HYPERVISOR_VIRT_END is passed directly to > init_domheap_pages() later in __start_xen(), and the bad page list is > not consulted. > > Fix this by marking those pages as PGC_broken in the frametable at the > time end_boot_allocator() runs, and then making init_heap_pages() skip > over any pages which are so marked. > > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> > --- > xen/common/page_alloc.c | 82 +++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 79 insertions(+), 3 deletions(-) > > diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c > index 919a270587..3cf478311b 100644 > --- a/xen/common/page_alloc.c > +++ b/xen/common/page_alloc.c > @@ -1758,6 +1758,18 @@ int query_page_offline(mfn_t mfn, uint32_t *status) > return 0; > } > > +static unsigned long contig_avail_pages(struct page_info *pg, unsigned long max_pages) > +{ > + unsigned long i; > + > + for ( i = 0 ; i < max_pages; i++) > + { > + if ( pg[i].count_info & PGC_broken ) > + break; > + } > + return i; > +} > + > /* > * Hand the specified arbitrary page range to the specified heap zone > * checking the node_id of the previous page. If they differ and the > @@ -1799,18 +1811,23 @@ static void init_heap_pages( > { > unsigned int nid = phys_to_nid(page_to_maddr(pg+i)); > > + /* If the (first) page is already marked broken, don't add it. 
*/ > + if ( pg[i].count_info & PGC_broken ) > + continue; > + if ( unlikely(!avail[nid]) ) > { > + unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages); > unsigned long s = mfn_x(page_to_mfn(pg + i)); > - unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1)); > + unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + i + contig_nr_pages - 1), 1)); > bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) && > !(s & ((1UL << MAX_ORDER) - 1)) && > (find_first_set_bit(e) <= find_first_set_bit(s)); > unsigned long n; > > - n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i, > + n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), contig_nr_pages, > &use_tail); > - BUG_ON(i + n > nr_pages); > + BUG_ON(n > contig_nr_pages); > if ( n && !use_tail ) > { > i += n - 1; > @@ -1846,6 +1863,63 @@ static unsigned long avail_heap_pages( > return free_pages; > } > > +static void mark_bad_pages(void) > +{ > + unsigned long bad_spfn, bad_epfn; I would like to avoid adding more 'pfn' in the code. Per the current terminology (see include/xen/mm.h), this would be bad_smfn, bad_emfn. Ideally, this should also use mfn_t, but I can live without it for now. > + const char *p; > + struct page_info *pg; > +#ifdef CONFIG_X86 > + const struct platform_bad_page *badpage; > + unsigned int i, j, array_size; > + > + badpage = get_platform_badpages(&array_size); > + if ( badpage ) > + { > + for ( i = 0; i < array_size; i++ ) > + { > + for ( j = 0; j < 1UL << badpage->order; j++ ) > + { > + if ( mfn_valid(_mfn(badpage->mfn + j)) ) > + { > + pg = mfn_to_page(_mfn(badpage->mfn + j)); > + pg->count_info |= PGC_broken; > + page_list_add_tail(pg, &page_broken_list); > + } > + } > + } > + } You are missing the PV shim part here. But, AFAICT, the only difference with the implementation in init_boot_pages() is how we treat the page. Could we introduce a common helper taking a callback? The callback would be applied on every range.
> +#endif > + > + /* Check new pages against the bad-page list. */ > + p = opt_badpage; > + while ( *p != '\0' ) > + { > + bad_spfn = simple_strtoul(p, &p, 0); > + bad_epfn = bad_spfn; > + > + if ( *p == '-' ) > + { > + p++; > + bad_epfn = simple_strtoul(p, &p, 0); > + if ( bad_epfn < bad_spfn ) > + bad_epfn = bad_spfn; > + } > + > + if ( *p == ',' ) > + p++; > + else if ( *p != '\0' ) > + break; > + > + while ( mfn_valid(_mfn(bad_spfn)) && bad_spfn < bad_epfn ) > + { > + pg = mfn_to_page(_mfn(bad_spfn)); > + pg->count_info |= PGC_broken; > + page_list_add_tail(pg, &page_broken_list); > + bad_spfn++; > + } > + } > +} > + > void __init end_boot_allocator(void) > { > unsigned int i; > @@ -1870,6 +1944,8 @@ void __init end_boot_allocator(void) > } > nr_bootmem_regions = 0; > > + mark_bad_pages(); > + > if ( !dma_bitsize && (num_online_nodes() > 1) ) > dma_bitsize = arch_get_dma_bitsize(); > > Cheers,
On 01.02.2020 01:32, David Woodhouse wrote: > --- a/xen/common/page_alloc.c > +++ b/xen/common/page_alloc.c > @@ -1758,6 +1758,18 @@ int query_page_offline(mfn_t mfn, uint32_t *status) > return 0; > } > > +static unsigned long contig_avail_pages(struct page_info *pg, unsigned long max_pages) > +{ > + unsigned long i; > + > + for ( i = 0 ; i < max_pages; i++) Nit: Stray blank before first semicolon. > + { > + if ( pg[i].count_info & PGC_broken ) > + break; > + } > + return i; Further nits: Commonly we omit the braces in cases like this one. We also like to have blank lines before the main return statement of a function. > @@ -1846,6 +1863,63 @@ static unsigned long avail_heap_pages( > return free_pages; > } > > +static void mark_bad_pages(void) __init please > +{ > + unsigned long bad_spfn, bad_epfn; > + const char *p; > + struct page_info *pg; > +#ifdef CONFIG_X86 > + const struct platform_bad_page *badpage; > + unsigned int i, j, array_size; > + > + badpage = get_platform_badpages(&array_size); > + if ( badpage ) > + { > + for ( i = 0; i < array_size; i++ ) > + { > + for ( j = 0; j < 1UL << badpage->order; j++ ) Either you mean badpage[i].* here and below, or you're missing an increment of badpage somewhere. > + { > + if ( mfn_valid(_mfn(badpage->mfn + j)) ) > + { > + pg = mfn_to_page(_mfn(badpage->mfn + j)); > + pg->count_info |= PGC_broken; > + page_list_add_tail(pg, &page_broken_list); > + } > + } > + } > + } > +#endif > + > + /* Check new pages against the bad-page list. 
*/ > + p = opt_badpage; > + while ( *p != '\0' ) > + { > + bad_spfn = simple_strtoul(p, &p, 0); > + bad_epfn = bad_spfn; > + > + if ( *p == '-' ) > + { > + p++; > + bad_epfn = simple_strtoul(p, &p, 0); > + if ( bad_epfn < bad_spfn ) > + bad_epfn = bad_spfn; > + } > + > + if ( *p == ',' ) > + p++; > + else if ( *p != '\0' ) > + break; I think this common (with init_boot_pages()) part of parsing would be better abstracted out, such that there will be just one instance of it, and hence there's no risk of things going out of sync. > + while ( mfn_valid(_mfn(bad_spfn)) && bad_spfn < bad_epfn ) As per init_boot_pages() as well as per the "bad_epfn = bad_spfn;" you have further up, the range here is inclusive at its end. I'm also uncertain about the stopping at the first !mfn_valid() - there may well be further valid pages later on. Jan
Hi David, On 01/02/2020 00:32, David Woodhouse wrote: > /* > * Hand the specified arbitrary page range to the specified heap zone > * checking the node_id of the previous page. If they differ and the > @@ -1799,18 +1811,23 @@ static void init_heap_pages( > { > unsigned int nid = phys_to_nid(page_to_maddr(pg+i)); > > + /* If the (first) page is already marked broken, don't add it. */ > + if ( pg[i].count_info & PGC_broken ) > + continue; > + > if ( unlikely(!avail[nid]) ) > { > + unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages); I spent the evening chasing a bug due to this change. Xen may give to init_heap_pages() a range that crosses NUMA nodes. contig_avail_pages will go through the range [pg + i, pg + i + nr_pages). As nr_pages corresponds to the number of pages in the original range, we may overrun the frametable and crash. So I think you want to use contig_avail_pages(pg + i, nr_pages - i). Cheers,
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 919a270587..3cf478311b 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -1758,6 +1758,18 @@ int query_page_offline(mfn_t mfn, uint32_t *status) return 0; } +static unsigned long contig_avail_pages(struct page_info *pg, unsigned long max_pages) +{ + unsigned long i; + + for ( i = 0 ; i < max_pages; i++) + { + if ( pg[i].count_info & PGC_broken ) + break; + } + return i; +} + /* * Hand the specified arbitrary page range to the specified heap zone * checking the node_id of the previous page. If they differ and the @@ -1799,18 +1811,23 @@ static void init_heap_pages( { unsigned int nid = phys_to_nid(page_to_maddr(pg+i)); + /* If the (first) page is already marked broken, don't add it. */ + if ( pg[i].count_info & PGC_broken ) + continue; + if ( unlikely(!avail[nid]) ) { + unsigned long contig_nr_pages = contig_avail_pages(pg + i, nr_pages); unsigned long s = mfn_x(page_to_mfn(pg + i)); - unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1)); + unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + i + contig_nr_pages - 1), 1)); bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) && !(s & ((1UL << MAX_ORDER) - 1)) && (find_first_set_bit(e) <= find_first_set_bit(s)); unsigned long n; - n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i, + n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), contig_nr_pages, &use_tail); - BUG_ON(i + n > nr_pages); + BUG_ON(n > contig_nr_pages); if ( n && !use_tail ) { i += n - 1; @@ -1846,6 +1863,63 @@ static unsigned long avail_heap_pages( return free_pages; } +static void mark_bad_pages(void) +{ + unsigned long bad_spfn, bad_epfn; + const char *p; + struct page_info *pg; +#ifdef CONFIG_X86 + const struct platform_bad_page *badpage; + unsigned int i, j, array_size; + + badpage = get_platform_badpages(&array_size); + if ( badpage ) + { + for ( i = 0; i < array_size; i++ ) + { + for ( j = 0; j < 1UL << badpage->order; j++ ) 
+ { + if ( mfn_valid(_mfn(badpage->mfn + j)) ) + { + pg = mfn_to_page(_mfn(badpage->mfn + j)); + pg->count_info |= PGC_broken; + page_list_add_tail(pg, &page_broken_list); + } + } + } + } +#endif + + /* Check new pages against the bad-page list. */ + p = opt_badpage; + while ( *p != '\0' ) + { + bad_spfn = simple_strtoul(p, &p, 0); + bad_epfn = bad_spfn; + + if ( *p == '-' ) + { + p++; + bad_epfn = simple_strtoul(p, &p, 0); + if ( bad_epfn < bad_spfn ) + bad_epfn = bad_spfn; + } + + if ( *p == ',' ) + p++; + else if ( *p != '\0' ) + break; + + while ( mfn_valid(_mfn(bad_spfn)) && bad_spfn < bad_epfn ) + { + pg = mfn_to_page(_mfn(bad_spfn)); + pg->count_info |= PGC_broken; + page_list_add_tail(pg, &page_broken_list); + bad_spfn++; + } + } +} + void __init end_boot_allocator(void) { unsigned int i; @@ -1870,6 +1944,8 @@ void __init end_boot_allocator(void) } nr_bootmem_regions = 0; + mark_bad_pages(); + if ( !dma_bitsize && (num_online_nodes() > 1) ) dma_bitsize = arch_get_dma_bitsize();