diff mbox series

mm/page_alloc: Skip non present sections on zone initialization

Message ID 20191230093828.24613-1-kirill.shutemov@linux.intel.com
State New, archived
Headers show
Series mm/page_alloc: Skip non present sections on zone initialization | expand

Commit Message

Kirill A. Shutemov Dec. 30, 2019, 9:38 a.m. UTC
memmap_init_zone() can be called on ranges with holes during boot. It
will skip any non-valid PFNs one-by-one. It works fine as long as holes
are not too big.

But huge holes in the memory map cause a problem. It takes over 20
seconds to walk a 32TiB hole. x86-64 with 5-level paging allows for much
larger holes in the memory map, which would practically hang the system.

Deferred struct page init doesn't help here. It only works on the
present ranges.

Skipping non-present sections would fix the issue.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---

The situation can be emulated using the following QEMU patch:

Comments

Baoquan He Dec. 31, 2019, 1:23 a.m. UTC | #1
On 12/30/19 at 12:38pm, Kirill A. Shutemov wrote:
> memmap_init_zone() can be called on the ranges with holes during the
> boot. It will skip any non-valid PFNs one-by-one. It works fine as long
> as holes are not too big.
> 
> But huge holes in the memory map causes a problem. It takes over 20
> seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
> larger holes in the memory map which would practically hang the system.
> 
> Deferred struct page init doesn't help here. It only works on the
> present ranges.
> 
> Skipping non-present sections would fix the issue.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
> 
> The situation can be emulated using the following QEMU patch:
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index ac08e6360437..f5f2258092e1 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1159,13 +1159,14 @@ void pc_memory_init(PCMachineState *pcms,
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
>      e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
>      if (x86ms->above_4g_mem_size > 0) {
> +        int shift = 45;
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
>          memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>                                   x86ms->below_4g_mem_size,
>                                   x86ms->above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        memory_region_add_subregion(system_memory, 1ULL << shift,
>                                      ram_above_4g);
> -        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
> +        e820_add_entry(1ULL << shift, x86ms->above_4g_mem_size, E820_RAM);
>      }
>  
>      if (!pcmc->has_reserved_memory &&
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index cde2a16b941a..694c26947bf6 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1928,7 +1928,7 @@ uint64_t cpu_get_tsc(CPUX86State *env);
>  /* XXX: This value should match the one returned by CPUID
>   * and in exec.c */
>  # if defined(TARGET_X86_64)
> -# define TCG_PHYS_ADDR_BITS 40
> +# define TCG_PHYS_ADDR_BITS 52
>  # else
>  # define TCG_PHYS_ADDR_BITS 36
>  # endif
> 
> ---
>  mm/page_alloc.c | 28 +++++++++++++++++++++++++++-
>  1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index df62a49cd09e..442dc0244bb4 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5873,6 +5873,30 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>  	return false;
>  }
>  
> +#ifdef CONFIG_SPARSEMEM
> +/* Skip PFNs that belong to non-present sections */
> +static inline __meminit unsigned long next_pfn(unsigned long pfn)
> +{
> +	unsigned long section_nr;
> +
> +	section_nr = pfn_to_section_nr(++pfn);
> +	if (present_section_nr(section_nr))
> +		return pfn;
> +
> +	while (++section_nr <= __highest_present_section_nr) {
> +		if (present_section_nr(section_nr))
> +			return section_nr_to_pfn(section_nr);
> +	}
> +
> +	return -1;
> +}
> +#else
> +static inline __meminit unsigned long next_pfn(unsigned long pfn)
> +{
> +	return pfn++;
> +}
> +#endif
> +
>  /*
>   * Initially all pages are reserved - free ones are freed
>   * up by memblock_free_all() once the early boot process is
> @@ -5912,8 +5936,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  		 * function.  They do not exist on hotplugged memory.
>  		 */
>  		if (context == MEMMAP_EARLY) {
> -			if (!early_pfn_valid(pfn))
> +			if (!early_pfn_valid(pfn)) {
> +				pfn = next_pfn(pfn) - 1;

Just pass by, I think this is a necessary optimization. Wondering why
next_pfn(pfn) is not put in for loop:
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn=next_pfn(pfn)) {


>  				continue;
> +			}
>  			if (!early_pfn_in_nid(pfn, nid))
>  				continue;

Why don't the other two 'continue' statements need to be worried about
in the huge hole case?

Thanks
Baoquan
Baoquan He Dec. 31, 2019, 1:33 a.m. UTC | #2
On 12/31/19 at 09:23am, Baoquan He wrote:
> On 12/30/19 at 12:38pm, Kirill A. Shutemov wrote:
> > memmap_init_zone() can be called on the ranges with holes during the
> > boot. It will skip any non-valid PFNs one-by-one. It works fine as long
> > as holes are not too big.
> > 
> > But huge holes in the memory map causes a problem. It takes over 20
> > seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
> > larger holes in the memory map which would practically hang the system.
> > 
> > Deferred struct page init doesn't help here. It only works on the
> > present ranges.
> > 
> > Skipping non-present sections would fix the issue.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> > 
> > The situation can be emulated using the following QEMU patch:
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index ac08e6360437..f5f2258092e1 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1159,13 +1159,14 @@ void pc_memory_init(PCMachineState *pcms,
> >      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> >      e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
> >      if (x86ms->above_4g_mem_size > 0) {
> > +        int shift = 45;
> >          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> >          memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> >                                   x86ms->below_4g_mem_size,
> >                                   x86ms->above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +        memory_region_add_subregion(system_memory, 1ULL << shift,
> >                                      ram_above_4g);
> > -        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
> > +        e820_add_entry(1ULL << shift, x86ms->above_4g_mem_size, E820_RAM);
> >      }
> >  
> >      if (!pcmc->has_reserved_memory &&
> > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > index cde2a16b941a..694c26947bf6 100644
> > --- a/target/i386/cpu.h
> > +++ b/target/i386/cpu.h
> > @@ -1928,7 +1928,7 @@ uint64_t cpu_get_tsc(CPUX86State *env);
> >  /* XXX: This value should match the one returned by CPUID
> >   * and in exec.c */
> >  # if defined(TARGET_X86_64)
> > -# define TCG_PHYS_ADDR_BITS 40
> > +# define TCG_PHYS_ADDR_BITS 52
> >  # else
> >  # define TCG_PHYS_ADDR_BITS 36
> >  # endif
> > 
> > ---
> >  mm/page_alloc.c | 28 +++++++++++++++++++++++++++-
> >  1 file changed, 27 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index df62a49cd09e..442dc0244bb4 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -5873,6 +5873,30 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
> >  	return false;
> >  }
> >  
> > +#ifdef CONFIG_SPARSEMEM
> > +/* Skip PFNs that belong to non-present sections */
> > +static inline __meminit unsigned long next_pfn(unsigned long pfn)
> > +{
> > +	unsigned long section_nr;
> > +
> > +	section_nr = pfn_to_section_nr(++pfn);
> > +	if (present_section_nr(section_nr))
> > +		return pfn;
> > +
> > +	while (++section_nr <= __highest_present_section_nr) {
> > +		if (present_section_nr(section_nr))
> > +			return section_nr_to_pfn(section_nr);
> > +	}
> > +
> > +	return -1;
> > +}
> > +#else
> > +static inline __meminit unsigned long next_pfn(unsigned long pfn)
> > +{
> > +	return pfn++;
> > +}
> > +#endif
> > +
> >  /*
> >   * Initially all pages are reserved - free ones are freed
> >   * up by memblock_free_all() once the early boot process is
> > @@ -5912,8 +5936,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> >  		 * function.  They do not exist on hotplugged memory.
> >  		 */
> >  		if (context == MEMMAP_EARLY) {
> > -			if (!early_pfn_valid(pfn))
> > +			if (!early_pfn_valid(pfn)) {
> > +				pfn = next_pfn(pfn) - 1;
> 
> Just pass by, I think this is a necessary optimization. Wondering why
> next_pfn(pfn) is not put in for loop:
> -	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +	for (pfn = start_pfn; pfn < end_pfn; pfn=next_pfn(pfn)) {
> 
> 
> >  				continue;
> > +			}
> >  			if (!early_pfn_in_nid(pfn, nid))
> >  				continue;
> 
> Why the other two 'continue' don't need be worried on the huge hole
> case?

OK, I see. early_pfn_valid() may have encountered the huge hole case,
the check in patch sounds reasonable.

FWIW, looks good to me.

Reviewed-by: Baoquan He <bhe@redhat.com>

Thanks
Baoquan
Michal Hocko Jan. 8, 2020, 2:40 p.m. UTC | #3
On Mon 30-12-19 12:38:28, Kirill A. Shutemov wrote:
> memmap_init_zone() can be called on the ranges with holes during the
> boot. It will skip any non-valid PFNs one-by-one. It works fine as long
> as holes are not too big.
> 
> But huge holes in the memory map causes a problem. It takes over 20
> seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
> larger holes in the memory map which would practically hang the system.
> 
> Deferred struct page init doesn't help here. It only works on the
> present ranges.
> 
> Skipping non-present sections would fix the issue.

Makes sense to me.

> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

That pfn inc back and forth is quite ugly TBH but whatever.
Acked-by: Michal Hocko <mhocko@suse.com>

> ---
> 
> The situation can be emulated using the following QEMU patch:
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index ac08e6360437..f5f2258092e1 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1159,13 +1159,14 @@ void pc_memory_init(PCMachineState *pcms,
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
>      e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
>      if (x86ms->above_4g_mem_size > 0) {
> +        int shift = 45;
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
>          memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>                                   x86ms->below_4g_mem_size,
>                                   x86ms->above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        memory_region_add_subregion(system_memory, 1ULL << shift,
>                                      ram_above_4g);
> -        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
> +        e820_add_entry(1ULL << shift, x86ms->above_4g_mem_size, E820_RAM);
>      }
>  
>      if (!pcmc->has_reserved_memory &&
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index cde2a16b941a..694c26947bf6 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1928,7 +1928,7 @@ uint64_t cpu_get_tsc(CPUX86State *env);
>  /* XXX: This value should match the one returned by CPUID
>   * and in exec.c */
>  # if defined(TARGET_X86_64)
> -# define TCG_PHYS_ADDR_BITS 40
> +# define TCG_PHYS_ADDR_BITS 52
>  # else
>  # define TCG_PHYS_ADDR_BITS 36
>  # endif
> 
> ---
>  mm/page_alloc.c | 28 +++++++++++++++++++++++++++-
>  1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index df62a49cd09e..442dc0244bb4 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5873,6 +5873,30 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>  	return false;
>  }
>  
> +#ifdef CONFIG_SPARSEMEM
> +/* Skip PFNs that belong to non-present sections */
> +static inline __meminit unsigned long next_pfn(unsigned long pfn)
> +{
> +	unsigned long section_nr;
> +
> +	section_nr = pfn_to_section_nr(++pfn);
> +	if (present_section_nr(section_nr))
> +		return pfn;
> +
> +	while (++section_nr <= __highest_present_section_nr) {
> +		if (present_section_nr(section_nr))
> +			return section_nr_to_pfn(section_nr);
> +	}
> +
> +	return -1;
> +}
> +#else
> +static inline __meminit unsigned long next_pfn(unsigned long pfn)
> +{
> +	return pfn++;
> +}
> +#endif
> +
>  /*
>   * Initially all pages are reserved - free ones are freed
>   * up by memblock_free_all() once the early boot process is
> @@ -5912,8 +5936,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  		 * function.  They do not exist on hotplugged memory.
>  		 */
>  		if (context == MEMMAP_EARLY) {
> -			if (!early_pfn_valid(pfn))
> +			if (!early_pfn_valid(pfn)) {
> +				pfn = next_pfn(pfn) - 1;
>  				continue;
> +			}
>  			if (!early_pfn_in_nid(pfn, nid))
>  				continue;
>  			if (overlap_memmap_init(zone, &pfn))
> -- 
> 2.24.1
>
David Hildenbrand Jan. 10, 2020, 1:15 p.m. UTC | #4
On 08.01.20 15:40, Michal Hocko wrote:
> On Mon 30-12-19 12:38:28, Kirill A. Shutemov wrote:
>> memmap_init_zone() can be called on the ranges with holes during the
>> boot. It will skip any non-valid PFNs one-by-one. It works fine as long
>> as holes are not too big.
>>
>> But huge holes in the memory map causes a problem. It takes over 20
>> seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
>> larger holes in the memory map which would practically hang the system.
>>
>> Deferred struct page init doesn't help here. It only works on the
>> present ranges.
>>
>> Skipping non-present sections would fix the issue.
> 
> Makes sense to me.
> 
>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> 
> That pfn inc back and forth is quite ugly TBH but whatever.

Indeed, can we please rewrite the loop to fix that?
Kirill A. Shutemov Jan. 10, 2020, 1:45 p.m. UTC | #5
On Fri, Jan 10, 2020 at 02:15:26PM +0100, David Hildenbrand wrote:
> On 08.01.20 15:40, Michal Hocko wrote:
> > On Mon 30-12-19 12:38:28, Kirill A. Shutemov wrote:
> >> memmap_init_zone() can be called on the ranges with holes during the
> >> boot. It will skip any non-valid PFNs one-by-one. It works fine as long
> >> as holes are not too big.
> >>
> >> But huge holes in the memory map causes a problem. It takes over 20
> >> seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
> >> larger holes in the memory map which would practically hang the system.
> >>
> >> Deferred struct page init doesn't help here. It only works on the
> >> present ranges.
> >>
> >> Skipping non-present sections would fix the issue.
> > 
> > Makes sense to me.
> > 
> >> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > 
> > That pfn inc back and forth is quite ugly TBH but whatever.
> 
> Indeed, can we please rewrite the loop to fix that?

Any suggestions?

I don't see an obvious way to not break readability in another place.
David Hildenbrand Jan. 10, 2020, 2:34 p.m. UTC | #6
On 10.01.20 14:45, Kirill A. Shutemov wrote:
> On Fri, Jan 10, 2020 at 02:15:26PM +0100, David Hildenbrand wrote:
>> On 08.01.20 15:40, Michal Hocko wrote:
>>> On Mon 30-12-19 12:38:28, Kirill A. Shutemov wrote:
>>>> memmap_init_zone() can be called on the ranges with holes during the
>>>> boot. It will skip any non-valid PFNs one-by-one. It works fine as long
>>>> as holes are not too big.
>>>>
>>>> But huge holes in the memory map causes a problem. It takes over 20
>>>> seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
>>>> larger holes in the memory map which would practically hang the system.
>>>>
>>>> Deferred struct page init doesn't help here. It only works on the
>>>> present ranges.
>>>>
>>>> Skipping non-present sections would fix the issue.
>>>
>>> Makes sense to me.
>>>
>>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>>
>>> That pfn inc back and forth is quite ugly TBH but whatever.
>>
>> Indeed, can we please rewrite the loop to fix that?
> 
> Any suggestions?
> 
> I don't see an obvious way to not break readablity in another place.
> 

I'd probably do it like this (applied some other tweaks, untested)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb766aac6772..a96b1ad1d74b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5859,6 +5859,22 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
        return false;
 }
 
+static inline __meminit unsigned long next_present_pfn(unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+       unsigned long section_nr = pfn_to_section_nr(pfn + 1);
+
+       /*
+        * Note: We don't check the subsection bitmap, so this can produce
+        * false positives when only subsections are present/valid. The
+        * caller should recheck if the returned pfn is valid.
+        */
+       if (!present_section_nr(section_nr))
+               return section_nr_to_pfn(next_present_section_nr(section_nr));
+#endif
+       return pfn++;
+}
+
 /*
  * Initially all pages are reserved - free ones are freed
  * up by memblock_free_all() once the early boot process is
@@ -5892,18 +5908,22 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
        }
 #endif
 
-       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+       pfn = start_pfn;
+       while (pfn < end_pfn) {
                /*
                 * There can be holes in boot-time mem_map[]s handed to this
                 * function.  They do not exist on hotplugged memory.
                 */
                if (context == MEMMAP_EARLY) {
-                       if (!early_pfn_valid(pfn))
+                       if (!early_pfn_valid(pfn)) {
+                               pfn = next_present_pfn(pfn, end_pfn);
                                continue;
-                       if (!early_pfn_in_nid(pfn, nid))
-                               continue;
-                       if (overlap_memmap_init(zone, &pfn))
+                       }
+                       if (!early_pfn_in_nid(pfn, nid) ||
+                           overlap_memmap_init(zone, &pfn)) {
+                               pfn++;
                                continue;
+                       }
                        if (defer_init(nid, pfn, end_pfn))
                                break;
                }
@@ -5929,6 +5949,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                        cond_resched();
                }
+               pfn++;
        }


I played with using a "pfn = next_init_pfn()" in the for loop instead, moving all
the checks in there, but didn't turn out too well.
Kirill A. Shutemov Jan. 10, 2020, 2:47 p.m. UTC | #7
On Fri, Jan 10, 2020 at 03:34:49PM +0100, David Hildenbrand wrote:
> On 10.01.20 14:45, Kirill A. Shutemov wrote:
> > On Fri, Jan 10, 2020 at 02:15:26PM +0100, David Hildenbrand wrote:
> >> On 08.01.20 15:40, Michal Hocko wrote:
> >>> On Mon 30-12-19 12:38:28, Kirill A. Shutemov wrote:
> >>>> memmap_init_zone() can be called on the ranges with holes during the
> >>>> boot. It will skip any non-valid PFNs one-by-one. It works fine as long
> >>>> as holes are not too big.
> >>>>
> >>>> But huge holes in the memory map causes a problem. It takes over 20
> >>>> seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
> >>>> larger holes in the memory map which would practically hang the system.
> >>>>
> >>>> Deferred struct page init doesn't help here. It only works on the
> >>>> present ranges.
> >>>>
> >>>> Skipping non-present sections would fix the issue.
> >>>
> >>> Makes sense to me.
> >>>
> >>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> >>>
> >>> That pfn inc back and forth is quite ugly TBH but whatever.
> >>
> >> Indeed, can we please rewrite the loop to fix that?
> > 
> > Any suggestions?
> > 
> > I don't see an obvious way to not break readablity in another place.
> > 
> 
> I'd probably do it like this (applied some other tweaks, untested)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index cb766aac6772..a96b1ad1d74b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5859,6 +5859,22 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>         return false;
>  }
>  
> +static inline __meminit unsigned long next_present_pfn(unsigned long pfn)
> +{
> +#ifdef CONFIG_SPARSEMEM

I would rather keep it around function, but it's matter of taste.

> +       unsigned long section_nr = pfn_to_section_nr(pfn + 1);
> +
> +       /*
> +        * Note: We don't check the subsection bitmap, so this can produce
> +        * false positives when only subsections are present/valid. The
> +        * caller should recheck if the returned pfn is valid.
> +        */
> +       if (!present_section_nr(section_nr))
> +               return section_nr_to_pfn(next_present_section_nr(section_nr));

This won't compile. next_present_section_nr() is static to mm/sparse.c.

> +#endif
> +       return pfn++;
> +}
> +
>  /*
>   * Initially all pages are reserved - free ones are freed
>   * up by memblock_free_all() once the early boot process is
> @@ -5892,18 +5908,22 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>         }
>  #endif
>  
> -       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +       pfn = start_pfn;
> +       while (pfn < end_pfn) {
>                 /*
>                  * There can be holes in boot-time mem_map[]s handed to this
>                  * function.  They do not exist on hotplugged memory.
>                  */
>                 if (context == MEMMAP_EARLY) {
> -                       if (!early_pfn_valid(pfn))
> +                       if (!early_pfn_valid(pfn)) {
> +                               pfn = next_present_pfn(pfn, end_pfn);
>                                 continue;
> -                       if (!early_pfn_in_nid(pfn, nid))
> -                               continue;
> -                       if (overlap_memmap_init(zone, &pfn))
> +                       }
> +                       if (!early_pfn_in_nid(pfn, nid) ||
> +                           overlap_memmap_init(zone, &pfn)) {
> +                               pfn++;
>                                 continue;
> +                       }
>                         if (defer_init(nid, pfn, end_pfn))
>                                 break;
>                 }
> @@ -5929,6 +5949,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
>                         cond_resched();
>                 }
> +               pfn++;
>         }
> 
> 
> I played with using a "pfn = next_init_pfn()" in the for loop instead, moving all
> the checks in there, but didn't turn out too well.

Well, it's better than I thought, but... I'm fine either way.
David Hildenbrand Jan. 10, 2020, 2:48 p.m. UTC | #8
On 10.01.20 15:47, Kirill A. Shutemov wrote:
> On Fri, Jan 10, 2020 at 03:34:49PM +0100, David Hildenbrand wrote:
>> On 10.01.20 14:45, Kirill A. Shutemov wrote:
>>> On Fri, Jan 10, 2020 at 02:15:26PM +0100, David Hildenbrand wrote:
>>>> On 08.01.20 15:40, Michal Hocko wrote:
>>>>> On Mon 30-12-19 12:38:28, Kirill A. Shutemov wrote:
>>>>>> memmap_init_zone() can be called on the ranges with holes during the
>>>>>> boot. It will skip any non-valid PFNs one-by-one. It works fine as long
>>>>>> as holes are not too big.
>>>>>>
>>>>>> But huge holes in the memory map causes a problem. It takes over 20
>>>>>> seconds to walk 32TiB hole. x86-64 with 5-level paging allows for much
>>>>>> larger holes in the memory map which would practically hang the system.
>>>>>>
>>>>>> Deferred struct page init doesn't help here. It only works on the
>>>>>> present ranges.
>>>>>>
>>>>>> Skipping non-present sections would fix the issue.
>>>>>
>>>>> Makes sense to me.
>>>>>
>>>>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>>>>
>>>>> That pfn inc back and forth is quite ugly TBH but whatever.
>>>>
>>>> Indeed, can we please rewrite the loop to fix that?
>>>
>>> Any suggestions?
>>>
>>> I don't see an obvious way to not break readablity in another place.
>>>
>>
>> I'd probably do it like this (applied some other tweaks, untested)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index cb766aac6772..a96b1ad1d74b 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -5859,6 +5859,22 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>>         return false;
>>  }
>>  
>> +static inline __meminit unsigned long next_present_pfn(unsigned long pfn)
>> +{
>> +#ifdef CONFIG_SPARSEMEM
> 
> I would rather keep it around function, but it's matter of taste.

Yes

> 
>> +       unsigned long section_nr = pfn_to_section_nr(pfn + 1);
>> +
>> +       /*
>> +        * Note: We don't check the subsection bitmap, so this can produce
>> +        * false positives when only subsections are present/valid. The
>> +        * caller should recheck if the returned pfn is valid.
>> +        */
>> +       if (!present_section_nr(section_nr))
>> +               return section_nr_to_pfn(next_present_section_nr(section_nr));
> 
> This won't compile. next_present_section_nr() is static to mm/sparse.c.

We should then move that to the header IMHO.
Kirill A. Shutemov Jan. 10, 2020, 2:54 p.m. UTC | #9
On Fri, Jan 10, 2020 at 03:48:39PM +0100, David Hildenbrand wrote:
> >> +       if (!present_section_nr(section_nr))
> >> +               return section_nr_to_pfn(next_present_section_nr(section_nr));
> > 
> > This won't compile. next_present_section_nr() is static to mm/sparse.c.
> 
> We should then move that to the header IMHO.

It looks like too much for a trivial cleanup.
David Hildenbrand Jan. 10, 2020, 2:56 p.m. UTC | #10
On 10.01.20 15:54, Kirill A. Shutemov wrote:
> On Fri, Jan 10, 2020 at 03:48:39PM +0100, David Hildenbrand wrote:
>>>> +       if (!present_section_nr(section_nr))
>>>> +               return section_nr_to_pfn(next_present_section_nr(section_nr));
>>>
>>> This won't compile. next_present_section_nr() is static to mm/sparse.c.
>>
>> We should then move that to the header IMHO.
> 
> It looks like too much for a trivial cleanup.
> 

Cleanup? This is a performance improvement ("fix the issue."). We should
avoid duplicating code where it can be avoided.
Kirill A. Shutemov Jan. 10, 2020, 5:55 p.m. UTC | #11
On Fri, Jan 10, 2020 at 03:56:14PM +0100, David Hildenbrand wrote:
> On 10.01.20 15:54, Kirill A. Shutemov wrote:
> > On Fri, Jan 10, 2020 at 03:48:39PM +0100, David Hildenbrand wrote:
> >>>> +       if (!present_section_nr(section_nr))
> >>>> +               return section_nr_to_pfn(next_present_section_nr(section_nr));
> >>>
> >>> This won't compile. next_present_section_nr() is static to mm/sparse.c.
> >>
> >> We should then move that to the header IMHO.
> > 
> > It looks like too much for a trivial cleanup.
> > 
> 
> Cleanup? This is a performance improvement ("fix the issue."). We should
> avoid duplicating code where it can be avoided.

My original patch is in the -mm tree and fixes the issue. The thread is about
tidying it up.
David Hildenbrand Jan. 10, 2020, 6:05 p.m. UTC | #12
On 10.01.20 18:55, Kirill A. Shutemov wrote:
> On Fri, Jan 10, 2020 at 03:56:14PM +0100, David Hildenbrand wrote:
>> On 10.01.20 15:54, Kirill A. Shutemov wrote:
>>> On Fri, Jan 10, 2020 at 03:48:39PM +0100, David Hildenbrand wrote:
>>>>>> +       if (!present_section_nr(section_nr))
>>>>>> +               return section_nr_to_pfn(next_present_section_nr(section_nr));
>>>>>
>>>>> This won't compile. next_present_section_nr() is static to mm/sparse.c.
>>>>
>>>> We should then move that to the header IMHO.
>>>
>>> It looks like too much for a trivial cleanup.
>>>
>>
>> Cleanup? This is a performance improvement ("fix the issue."). We should
>> avoid duplicating code where it can be avoided.
> 
> My original patch is in -mm tree and fixes the issue. The thread is about
> tiding it up.

Just send a v2? This thread is review of this patch.

If you don't want to clean it up, I can send patches ...
Kirill A. Shutemov Jan. 10, 2020, 6:22 p.m. UTC | #13
On Fri, Jan 10, 2020 at 07:05:19PM +0100, David Hildenbrand wrote:
> On 10.01.20 18:55, Kirill A. Shutemov wrote:
> > On Fri, Jan 10, 2020 at 03:56:14PM +0100, David Hildenbrand wrote:
> >> On 10.01.20 15:54, Kirill A. Shutemov wrote:
> >>> On Fri, Jan 10, 2020 at 03:48:39PM +0100, David Hildenbrand wrote:
> >>>>>> +       if (!present_section_nr(section_nr))
> >>>>>> +               return section_nr_to_pfn(next_present_section_nr(section_nr));
> >>>>>
> >>>>> This won't compile. next_present_section_nr() is static to mm/sparse.c.
> >>>>
> >>>> We should then move that to the header IMHO.
> >>>
> >>> It looks like too much for a trivial cleanup.
> >>>
> >>
> >> Cleanup? This is a performance improvement ("fix the issue."). We should
> >> avoid duplicating code where it can be avoided.
> > 
> > My original patch is in -mm tree and fixes the issue. The thread is about
> > tiding it up.
> 
> Just send a v2? This thread is review of this patch.
> 
> If you don't want to clean it up, I can send patches ...

Please send cleanups on top. My patch is less intrusive and easier to
backport if needed.
diff mbox series

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index ac08e6360437..f5f2258092e1 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1159,13 +1159,14 @@  void pc_memory_init(PCMachineState *pcms,
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
     e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
     if (x86ms->above_4g_mem_size > 0) {
+        int shift = 45;
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
         memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
                                  x86ms->below_4g_mem_size,
                                  x86ms->above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+        memory_region_add_subregion(system_memory, 1ULL << shift,
                                     ram_above_4g);
-        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
+        e820_add_entry(1ULL << shift, x86ms->above_4g_mem_size, E820_RAM);
     }
 
     if (!pcmc->has_reserved_memory &&
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index cde2a16b941a..694c26947bf6 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1928,7 +1928,7 @@  uint64_t cpu_get_tsc(CPUX86State *env);
 /* XXX: This value should match the one returned by CPUID
  * and in exec.c */
 # if defined(TARGET_X86_64)
-# define TCG_PHYS_ADDR_BITS 40
+# define TCG_PHYS_ADDR_BITS 52
 # else
 # define TCG_PHYS_ADDR_BITS 36
 # endif

---
 mm/page_alloc.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df62a49cd09e..442dc0244bb4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5873,6 +5873,30 @@  overlap_memmap_init(unsigned long zone, unsigned long *pfn)
 	return false;
 }
 
+#ifdef CONFIG_SPARSEMEM
+/* Skip PFNs that belong to non-present sections */
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+	unsigned long section_nr;
+
+	section_nr = pfn_to_section_nr(++pfn);
+	if (present_section_nr(section_nr))
+		return pfn;
+
+	while (++section_nr <= __highest_present_section_nr) {
+		if (present_section_nr(section_nr))
+			return section_nr_to_pfn(section_nr);
+	}
+
+	return -1;
+}
+#else
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+	return pfn++;
+}
+#endif
+
 /*
  * Initially all pages are reserved - free ones are freed
  * up by memblock_free_all() once the early boot process is
@@ -5912,8 +5936,10 @@  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		 * function.  They do not exist on hotplugged memory.
 		 */
 		if (context == MEMMAP_EARLY) {
-			if (!early_pfn_valid(pfn))
+			if (!early_pfn_valid(pfn)) {
+				pfn = next_pfn(pfn) - 1;
 				continue;
+			}
 			if (!early_pfn_in_nid(pfn, nid))
 				continue;
 			if (overlap_memmap_init(zone, &pfn))