diff mbox series

[RFC/RFT,1/3] memblock: update initialization of reserved pages

Message ID 20210407172607.8812-2-rppt@kernel.org (mailing list archive)
State New, archived
Headers show
Series arm64: drop pfn_valid_within() and simplify pfn_valid() | expand

Commit Message

Mike Rapoport April 7, 2021, 5:26 p.m. UTC
From: Mike Rapoport <rppt@linux.ibm.com>

The struct pages representing a reserved memory region are initialized
using the reserve_bootmem_region() function. This function is called for each
reserved region just before the memory is freed from memblock to the buddy
page allocator.

The struct pages for MEMBLOCK_NOMAP regions are kept with the default
values set by the memory map initialization which makes it necessary to
have a special treatment for such pages in pfn_valid() and
pfn_valid_within().

Split out initialization of the reserved pages to a function with a
meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
reserved regions and mark struct pages for the NOMAP regions as
PageReserved.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 mm/memblock.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

Comments

Anshuman Khandual April 8, 2021, 5:16 a.m. UTC | #1
On 4/7/21 10:56 PM, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> The struct pages representing a reserved memory region are initialized
> using reserve_bootmem_range() function. This function is called for each
> reserved region just before the memory is freed from memblock to the buddy
> page allocator.
> 
> The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> values set by the memory map initialization which makes it necessary to
> have a special treatment for such pages in pfn_valid() and
> pfn_valid_within().
> 
> Split out initialization of the reserved pages to a function with a
> meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
> reserved regions and mark struct pages for the NOMAP regions as
> PageReserved.

This would definitely need updating the comment for MEMBLOCK_NOMAP definition
in include/linux/memblock.h just to make the semantics clear, though arm64
is currently the only user for MEMBLOCK_NOMAP.

> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  mm/memblock.c | 23 +++++++++++++++++++++--
>  1 file changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/memblock.c b/mm/memblock.c
> index afaefa8fc6ab..6b7ea9d86310 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -2002,6 +2002,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
>  	return end_pfn - start_pfn;
>  }
>  
> +static void __init memmap_init_reserved_pages(void)
> +{
> +	struct memblock_region *region;
> +	phys_addr_t start, end;
> +	u64 i;
> +
> +	/* initialize struct pages for the reserved regions */
> +	for_each_reserved_mem_range(i, &start, &end)
> +		reserve_bootmem_region(start, end);
> +
> +	/* and also treat struct pages for the NOMAP regions as PageReserved */
> +	for_each_mem_region(region) {
> +		if (memblock_is_nomap(region)) {
> +			start = region->base;
> +			end = start + region->size;
> +			reserve_bootmem_region(start, end);
> +		}
> +	}
> +}
> +
>  static unsigned long __init free_low_memory_core_early(void)
>  {
>  	unsigned long count = 0;
> @@ -2010,8 +2030,7 @@ static unsigned long __init free_low_memory_core_early(void)
>  
>  	memblock_clear_hotplug(0, -1);
>  
> -	for_each_reserved_mem_range(i, &start, &end)
> -		reserve_bootmem_region(start, end);
> +	memmap_init_reserved_pages();
>  
>  	/*
>  	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
>
Mike Rapoport April 8, 2021, 5:48 a.m. UTC | #2
On Thu, Apr 08, 2021 at 10:46:18AM +0530, Anshuman Khandual wrote:
> 
> 
> On 4/7/21 10:56 PM, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > The struct pages representing a reserved memory region are initialized
> > using reserve_bootmem_range() function. This function is called for each
> > reserved region just before the memory is freed from memblock to the buddy
> > page allocator.
> > 
> > The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> > values set by the memory map initialization which makes it necessary to
> > have a special treatment for such pages in pfn_valid() and
> > pfn_valid_within().
> > 
> > Split out initialization of the reserved pages to a function with a
> > meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
> > reserved regions and mark struct pages for the NOMAP regions as
> > PageReserved.
> 
> This would definitely need updating the comment for MEMBLOCK_NOMAP definition
> in include/linux/memblock.h just to make the semantics is clear,

Sure

> though arm64 is currently the only user for MEMBLOCK_NOMAP.

> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >  mm/memblock.c | 23 +++++++++++++++++++++--
> >  1 file changed, 21 insertions(+), 2 deletions(-)
> > 
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index afaefa8fc6ab..6b7ea9d86310 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -2002,6 +2002,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
> >  	return end_pfn - start_pfn;
> >  }
> >  
> > +static void __init memmap_init_reserved_pages(void)
> > +{
> > +	struct memblock_region *region;
> > +	phys_addr_t start, end;
> > +	u64 i;
> > +
> > +	/* initialize struct pages for the reserved regions */
> > +	for_each_reserved_mem_range(i, &start, &end)
> > +		reserve_bootmem_region(start, end);
> > +
> > +	/* and also treat struct pages for the NOMAP regions as PageReserved */
> > +	for_each_mem_region(region) {
> > +		if (memblock_is_nomap(region)) {
> > +			start = region->base;
> > +			end = start + region->size;
> > +			reserve_bootmem_region(start, end);
> > +		}
> > +	}
> > +}
> > +
> >  static unsigned long __init free_low_memory_core_early(void)
> >  {
> >  	unsigned long count = 0;
> > @@ -2010,8 +2030,7 @@ static unsigned long __init free_low_memory_core_early(void)
> >  
> >  	memblock_clear_hotplug(0, -1);
> >  
> > -	for_each_reserved_mem_range(i, &start, &end)
> > -		reserve_bootmem_region(start, end);
> > +	memmap_init_reserved_pages();
> >  
> >  	/*
> >  	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
> >
David Hildenbrand April 14, 2021, 3:12 p.m. UTC | #3
On 07.04.21 19:26, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> The struct pages representing a reserved memory region are initialized
> using reserve_bootmem_range() function. This function is called for each
> reserved region just before the memory is freed from memblock to the buddy
> page allocator.
> 
> The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> values set by the memory map initialization which makes it necessary to
> have a special treatment for such pages in pfn_valid() and
> pfn_valid_within().

I assume these pages are never given to the buddy, because we don't have 
a direct mapping. So to the kernel, it's essentially just like a memory 
hole with benefits.

I can spot that we want to export such memory like any special memory 
thingy/hole in /proc/iomem -- "reserved", which makes sense.

I would assume that MEMBLOCK_NOMAP is a special type of *reserved* 
memory. IOW, that for_each_reserved_mem_range() should already succeed 
on these as well -- we should mark anything that is MEMBLOCK_NOMAP 
implicitly as reserved. Or are there valid reasons not to do so? What 
can anyone do with that memory?

I assume they are pretty much useless for the kernel, right? Like other 
reserved memory ranges.


> 
> Split out initialization of the reserved pages to a function with a
> meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
> reserved regions and mark struct pages for the NOMAP regions as
> PageReserved.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>   mm/memblock.c | 23 +++++++++++++++++++++--
>   1 file changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/memblock.c b/mm/memblock.c
> index afaefa8fc6ab..6b7ea9d86310 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -2002,6 +2002,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
>   	return end_pfn - start_pfn;
>   }
>   
> +static void __init memmap_init_reserved_pages(void)
> +{
> +	struct memblock_region *region;
> +	phys_addr_t start, end;
> +	u64 i;
> +
> +	/* initialize struct pages for the reserved regions */
> +	for_each_reserved_mem_range(i, &start, &end)
> +		reserve_bootmem_region(start, end);
> +
> +	/* and also treat struct pages for the NOMAP regions as PageReserved */
> +	for_each_mem_region(region) {
> +		if (memblock_is_nomap(region)) {
> +			start = region->base;
> +			end = start + region->size;
> +			reserve_bootmem_region(start, end);
> +		}
> +	}
> +}
> +
>   static unsigned long __init free_low_memory_core_early(void)
>   {
>   	unsigned long count = 0;
> @@ -2010,8 +2030,7 @@ static unsigned long __init free_low_memory_core_early(void)
>   
>   	memblock_clear_hotplug(0, -1);
>   
> -	for_each_reserved_mem_range(i, &start, &end)
> -		reserve_bootmem_region(start, end);
> +	memmap_init_reserved_pages();
>   
>   	/*
>   	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
>
Ard Biesheuvel April 14, 2021, 3:27 p.m. UTC | #4
On Wed, 14 Apr 2021 at 17:14, David Hildenbrand <david@redhat.com> wrote:
>
> On 07.04.21 19:26, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> >
> > The struct pages representing a reserved memory region are initialized
> > using reserve_bootmem_range() function. This function is called for each
> > reserved region just before the memory is freed from memblock to the buddy
> > page allocator.
> >
> > The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> > values set by the memory map initialization which makes it necessary to
> > have a special treatment for such pages in pfn_valid() and
> > pfn_valid_within().
>
> I assume these pages are never given to the buddy, because we don't have
> a direct mapping. So to the kernel, it's essentially just like a memory
> hole with benefits.
>
> I can spot that we want to export such memory like any special memory
> thingy/hole in /proc/iomem -- "reserved", which makes sense.
>
> I would assume that MEMBLOCK_NOMAP is a special type of *reserved*
> memory. IOW, that for_each_reserved_mem_range() should already succeed
> on these as well -- we should mark anything that is MEMBLOCK_NOMAP
> implicitly as reserved. Or are there valid reasons not to do so? What
> can anyone do with that memory?
>
> I assume they are pretty much useless for the kernel, right? Like other
> reserved memory ranges.
>

On ARM, we need to know whether any physical regions that do not
contain system memory contain something with device semantics or not.
One of the examples is ACPI tables: these are in reserved memory, and
so they are not covered by the linear region. However, when the ACPI
core ioremap()s an arbitrary memory region, we don't know whether it
is mapping a memory region or a device region unless we keep track of
this in some way. (Device mappings require device attributes, but
firmware tables require memory attributes, as they might be accessed
using misaligned reads)


>
> >
> > Split out initialization of the reserved pages to a function with a
> > meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
> > reserved regions and mark struct pages for the NOMAP regions as
> > PageReserved.
> >
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >   mm/memblock.c | 23 +++++++++++++++++++++--
> >   1 file changed, 21 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index afaefa8fc6ab..6b7ea9d86310 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -2002,6 +2002,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
> >       return end_pfn - start_pfn;
> >   }
> >
> > +static void __init memmap_init_reserved_pages(void)
> > +{
> > +     struct memblock_region *region;
> > +     phys_addr_t start, end;
> > +     u64 i;
> > +
> > +     /* initialize struct pages for the reserved regions */
> > +     for_each_reserved_mem_range(i, &start, &end)
> > +             reserve_bootmem_region(start, end);
> > +
> > +     /* and also treat struct pages for the NOMAP regions as PageReserved */
> > +     for_each_mem_region(region) {
> > +             if (memblock_is_nomap(region)) {
> > +                     start = region->base;
> > +                     end = start + region->size;
> > +                     reserve_bootmem_region(start, end);
> > +             }
> > +     }
> > +}
> > +
> >   static unsigned long __init free_low_memory_core_early(void)
> >   {
> >       unsigned long count = 0;
> > @@ -2010,8 +2030,7 @@ static unsigned long __init free_low_memory_core_early(void)
> >
> >       memblock_clear_hotplug(0, -1);
> >
> > -     for_each_reserved_mem_range(i, &start, &end)
> > -             reserve_bootmem_region(start, end);
> > +     memmap_init_reserved_pages();
> >
> >       /*
> >        * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
> >
>
>
> --
> Thanks,
>
> David / dhildenb
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
David Hildenbrand April 14, 2021, 3:52 p.m. UTC | #5
On 14.04.21 17:27, Ard Biesheuvel wrote:
> On Wed, 14 Apr 2021 at 17:14, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 07.04.21 19:26, Mike Rapoport wrote:
>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>
>>> The struct pages representing a reserved memory region are initialized
>>> using reserve_bootmem_range() function. This function is called for each
>>> reserved region just before the memory is freed from memblock to the buddy
>>> page allocator.
>>>
>>> The struct pages for MEMBLOCK_NOMAP regions are kept with the default
>>> values set by the memory map initialization which makes it necessary to
>>> have a special treatment for such pages in pfn_valid() and
>>> pfn_valid_within().
>>
>> I assume these pages are never given to the buddy, because we don't have
>> a direct mapping. So to the kernel, it's essentially just like a memory
>> hole with benefits.
>>
>> I can spot that we want to export such memory like any special memory
>> thingy/hole in /proc/iomem -- "reserved", which makes sense.
>>
>> I would assume that MEMBLOCK_NOMAP is a special type of *reserved*
>> memory. IOW, that for_each_reserved_mem_range() should already succeed
>> on these as well -- we should mark anything that is MEMBLOCK_NOMAP
>> implicitly as reserved. Or are there valid reasons not to do so? What
>> can anyone do with that memory?
>>
>> I assume they are pretty much useless for the kernel, right? Like other
>> reserved memory ranges.
>>
> 
> On ARM, we need to know whether any physical regions that do not
> contain system memory contain something with device semantics or not.
> One of the examples is ACPI tables: these are in reserved memory, and
> so they are not covered by the linear region. However, when the ACPI
> core ioremap()s an arbitrary memory region, we don't know whether it
> is mapping a memory region or a device region unless we keep track of
> this in some way. (Device mappings require device attributes, but
> firmware tables require memory attributes, as they might be accessed
> using misaligned reads)

Using generically sounding NOMAP ("don't create direct mapping") to 
identify device regions feels like a hack. I know, it was introduced 
just for that purpose.

Looking at memblock_mark_nomap(), we consider "device regions"

1) ACPI tables

2) VIDEO_TYPE_EFI memory

3) some device-tree regions in of/fdt.c


IIUC, right now we end up creating a memmap for this NOMAP memory, but 
hide it away in pfn_valid(). This patch set at least fixes that.

Assuming these pages are never mapped to user space via the struct page 
(which better be the case), we could further use a new pagetype to mark 
these pages in a special way, such that we can identify them directly 
via pfn_to_page().

Then, we could mostly avoid having to query memblock at runtime to 
figure out that this is special memory. This would obviously be an 
extension to this series. Just a thought.
Mike Rapoport April 14, 2021, 8:06 p.m. UTC | #6
On Wed, Apr 14, 2021 at 05:12:11PM +0200, David Hildenbrand wrote:
> On 07.04.21 19:26, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > The struct pages representing a reserved memory region are initialized
> > using reserve_bootmem_range() function. This function is called for each
> > reserved region just before the memory is freed from memblock to the buddy
> > page allocator.
> > 
> > The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> > values set by the memory map initialization which makes it necessary to
> > have a special treatment for such pages in pfn_valid() and
> > pfn_valid_within().
> 
> I assume these pages are never given to the buddy, because we don't have a
> direct mapping. So to the kernel, it's essentially just like a memory hole
> with benefits.

The pages should not be accessed as normal memory so they do not have a
direct (or in ARMish linear) mapping and are never given to buddy. 
After looking at the ACPI standard I don't see a fundamental reason for this
but they've already made this mess and we need to cope with it.
 
> I can spot that we want to export such memory like any special memory
> thingy/hole in /proc/iomem -- "reserved", which makes sense.

It does, but let's wait with /proc/iomem changes. We don't really have a
100% consistent view of it on different architectures, so adding yet
another type there does not seem, well, urgent.
 
> I would assume that MEMBLOCK_NOMAP is a special type of *reserved* memory.
> IOW, that for_each_reserved_mem_range() should already succeed on these as
> well -- we should mark anything that is MEMBLOCK_NOMAP implicitly as
> reserved. Or are there valid reasons not to do so? What can anyone do with
> that memory?
> 
> I assume they are pretty much useless for the kernel, right? Like other
> reserved memory ranges.

I agree that there is a lot of commonality between NOMAP and reserved. The
problem is that even semantics for reserved is different between
architectures. Moreover, on the same architecture there could be
E820_TYPE_RESERVED and memblock.reserved with different properties.

I'd really prefer moving in baby steps here because any change in the boot
mm can bring several months of early-hang debugging ;-)

> > Split out initialization of the reserved pages to a function with a
> > meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
> > reserved regions and mark struct pages for the NOMAP regions as
> > PageReserved.
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >   mm/memblock.c | 23 +++++++++++++++++++++--
> >   1 file changed, 21 insertions(+), 2 deletions(-)
> > 
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index afaefa8fc6ab..6b7ea9d86310 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -2002,6 +2002,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
> >   	return end_pfn - start_pfn;
> >   }
> > +static void __init memmap_init_reserved_pages(void)
> > +{
> > +	struct memblock_region *region;
> > +	phys_addr_t start, end;
> > +	u64 i;
> > +
> > +	/* initialize struct pages for the reserved regions */
> > +	for_each_reserved_mem_range(i, &start, &end)
> > +		reserve_bootmem_region(start, end);
> > +
> > +	/* and also treat struct pages for the NOMAP regions as PageReserved */
> > +	for_each_mem_region(region) {
> > +		if (memblock_is_nomap(region)) {
> > +			start = region->base;
> > +			end = start + region->size;
> > +			reserve_bootmem_region(start, end);
> > +		}
> > +	}
> > +}
> > +
> >   static unsigned long __init free_low_memory_core_early(void)
> >   {
> >   	unsigned long count = 0;
> > @@ -2010,8 +2030,7 @@ static unsigned long __init free_low_memory_core_early(void)
> >   	memblock_clear_hotplug(0, -1);
> > -	for_each_reserved_mem_range(i, &start, &end)
> > -		reserve_bootmem_region(start, end);
> > +	memmap_init_reserved_pages();
> >   	/*
> >   	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
David Hildenbrand April 14, 2021, 8:09 p.m. UTC | #7
Mike Rapoport <rppt@kernel.org> schrieb am Mi. 14. Apr. 2021 um 22:06:

> On Wed, Apr 14, 2021 at 05:12:11PM +0200, David Hildenbrand wrote:
> > On 07.04.21 19:26, Mike Rapoport wrote:
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > >
> > > The struct pages representing a reserved memory region are initialized
> > > using reserve_bootmem_range() function. This function is called for
> each
> > > reserved region just before the memory is freed from memblock to the
> buddy
> > > page allocator.
> > >
> > > The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> > > values set by the memory map initialization which makes it necessary to
> > > have a special treatment for such pages in pfn_valid() and
> > > pfn_valid_within().
> >
> > I assume these pages are never given to the buddy, because we don't have
> a
> > direct mapping. So to the kernel, it's essentially just like a memory
> hole
> > with benefits.
>
> The pages should not be accessed as normal memory so they do not have a
> direct (or in ARMish linear) mapping and are never given to buddy.
> After looking at ACPI standard I don't see a fundamental reason for this
> but they've already made this mess and we need to cope with it.
>
> > I can spot that we want to export such memory like any special memory
> > thingy/hole in /proc/iomem -- "reserved", which makes sense.
>
> It does, but let's wait with /proc/iomem changes. We don't really have a
> 100% consistent view of it on different architectures, so adding yet
> another type there does not seem, well, urgent.
>

To clarify: this is already done on arm64.


> > I would assume that MEMBLOCK_NOMAP is a special type of *reserved*
> memory.
> > IOW, that for_each_reserved_mem_range() should already succeed on these
> as
> > well -- we should mark anything that is MEMBLOCK_NOMAP implicitly as
> > reserved. Or are there valid reasons not to do so? What can anyone do
> with
> > that memory?
> >
> > I assume they are pretty much useless for the kernel, right? Like other
> > reserved memory ranges.
>
> I agree that there is a lot of commonality between NOMAP and reserved. The
> problem is that even semantics for reserved is different between
> architectures. Moreover, on the same architecture there could be
> E820_TYPE_RESERVED and memblock.reserved with different properties.
>
> I'd really prefer moving in baby steps here because any change in the boot
> mm can bear several month of early hangs debugging ;-)


Yeah I know. We just should have the desired target state figured out :)



>
> > > Split out initialization of the reserved pages to a function with a
> > > meaningful name and treat the MEMBLOCK_NOMAP regions the same way as
> the
> > > reserved regions and mark struct pages for the NOMAP regions as
> > > PageReserved.
> > >
> > > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > > ---
> > >   mm/memblock.c | 23 +++++++++++++++++++++--
> > >   1 file changed, 21 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/memblock.c b/mm/memblock.c
> > > index afaefa8fc6ab..6b7ea9d86310 100644
> > > --- a/mm/memblock.c
> > > +++ b/mm/memblock.c
> > > @@ -2002,6 +2002,26 @@ static unsigned long __init
> __free_memory_core(phys_addr_t start,
> > >     return end_pfn - start_pfn;
> > >   }
> > > +static void __init memmap_init_reserved_pages(void)
> > > +{
> > > +   struct memblock_region *region;
> > > +   phys_addr_t start, end;
> > > +   u64 i;
> > > +
> > > +   /* initialize struct pages for the reserved regions */
> > > +   for_each_reserved_mem_range(i, &start, &end)
> > > +           reserve_bootmem_region(start, end);
> > > +
> > > +   /* and also treat struct pages for the NOMAP regions as
> PageReserved */
> > > +   for_each_mem_region(region) {
> > > +           if (memblock_is_nomap(region)) {
> > > +                   start = region->base;
> > > +                   end = start + region->size;
> > > +                   reserve_bootmem_region(start, end);
> > > +           }
> > > +   }
> > > +}
> > > +
> > >   static unsigned long __init free_low_memory_core_early(void)
> > >   {
> > >     unsigned long count = 0;
> > > @@ -2010,8 +2030,7 @@ static unsigned long __init
> free_low_memory_core_early(void)
> > >     memblock_clear_hotplug(0, -1);
> > > -   for_each_reserved_mem_range(i, &start, &end)
> > > -           reserve_bootmem_region(start, end);
> > > +   memmap_init_reserved_pages();
> > >     /*
> > >      * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
>
> --
> Sincerely yours,
> Mike.
>
> --
Thanks,

David / dhildenb
Mike Rapoport April 14, 2021, 8:11 p.m. UTC | #8
On Wed, Apr 14, 2021 at 05:27:53PM +0200, Ard Biesheuvel wrote:
> On Wed, 14 Apr 2021 at 17:14, David Hildenbrand <david@redhat.com> wrote:
> >
> > On 07.04.21 19:26, Mike Rapoport wrote:
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > >
> > > The struct pages representing a reserved memory region are initialized
> > > using reserve_bootmem_range() function. This function is called for each
> > > reserved region just before the memory is freed from memblock to the buddy
> > > page allocator.
> > >
> > > The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> > > values set by the memory map initialization which makes it necessary to
> > > have a special treatment for such pages in pfn_valid() and
> > > pfn_valid_within().
> >
> > I assume these pages are never given to the buddy, because we don't have
> > a direct mapping. So to the kernel, it's essentially just like a memory
> > hole with benefits.
> >
> > I can spot that we want to export such memory like any special memory
> > thingy/hole in /proc/iomem -- "reserved", which makes sense.
> >
> > I would assume that MEMBLOCK_NOMAP is a special type of *reserved*
> > memory. IOW, that for_each_reserved_mem_range() should already succeed
> > on these as well -- we should mark anything that is MEMBLOCK_NOMAP
> > implicitly as reserved. Or are there valid reasons not to do so? What
> > can anyone do with that memory?
> >
> > I assume they are pretty much useless for the kernel, right? Like other
> > reserved memory ranges.
> >
> 
> On ARM, we need to know whether any physical regions that do not
> contain system memory contain something with device semantics or not.
> One of the examples is ACPI tables: these are in reserved memory, and
> so they are not covered by the linear region. However, when the ACPI
> core ioremap()s an arbitrary memory region, we don't know whether it
> is mapping a memory region or a device region unless we keep track of
> this in some way. (Device mappings require device attributes, but
> firmware tables require memory attributes, as they might be accessed
> using misaligned reads)

I mostly agree, but my understanding is that regions of *physical* memory
that are occupied by various pieces of EFI/ACPI information require special
treatment because it was defined this way in the ACPI spec.
And since ARM cannot tolerate aliased mappings with different caching modes
the whole bunch of firmware memory should be ioremap()ed to access it.

> > > Split out initialization of the reserved pages to a function with a
> > > meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
> > > reserved regions and mark struct pages for the NOMAP regions as
> > > PageReserved.
> > >
> > > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > > ---
> > >   mm/memblock.c | 23 +++++++++++++++++++++--
> > >   1 file changed, 21 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/memblock.c b/mm/memblock.c
> > > index afaefa8fc6ab..6b7ea9d86310 100644
> > > --- a/mm/memblock.c
> > > +++ b/mm/memblock.c
> > > @@ -2002,6 +2002,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
> > >       return end_pfn - start_pfn;
> > >   }
> > >
> > > +static void __init memmap_init_reserved_pages(void)
> > > +{
> > > +     struct memblock_region *region;
> > > +     phys_addr_t start, end;
> > > +     u64 i;
> > > +
> > > +     /* initialize struct pages for the reserved regions */
> > > +     for_each_reserved_mem_range(i, &start, &end)
> > > +             reserve_bootmem_region(start, end);
> > > +
> > > +     /* and also treat struct pages for the NOMAP regions as PageReserved */
> > > +     for_each_mem_region(region) {
> > > +             if (memblock_is_nomap(region)) {
> > > +                     start = region->base;
> > > +                     end = start + region->size;
> > > +                     reserve_bootmem_region(start, end);
> > > +             }
> > > +     }
> > > +}
> > > +
> > >   static unsigned long __init free_low_memory_core_early(void)
> > >   {
> > >       unsigned long count = 0;
> > > @@ -2010,8 +2030,7 @@ static unsigned long __init free_low_memory_core_early(void)
> > >
> > >       memblock_clear_hotplug(0, -1);
> > >
> > > -     for_each_reserved_mem_range(i, &start, &end)
> > > -             reserve_bootmem_region(start, end);
> > > +     memmap_init_reserved_pages();
> > >
> > >       /*
> > >        * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
> > >
> >
> >
> > --
> > Thanks,
> >
> > David / dhildenb
> >
> >
> > _______________________________________________
> > linux-arm-kernel mailing list
> > linux-arm-kernel@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Mike Rapoport April 14, 2021, 8:24 p.m. UTC | #9
On Wed, Apr 14, 2021 at 05:52:57PM +0200, David Hildenbrand wrote:
> On 14.04.21 17:27, Ard Biesheuvel wrote:
> > On Wed, 14 Apr 2021 at 17:14, David Hildenbrand <david@redhat.com> wrote:
> > > 
> > > On 07.04.21 19:26, Mike Rapoport wrote:
> > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > > 
> > > > The struct pages representing a reserved memory region are initialized
> > > > using reserve_bootmem_range() function. This function is called for each
> > > > reserved region just before the memory is freed from memblock to the buddy
> > > > page allocator.
> > > > 
> > > > The struct pages for MEMBLOCK_NOMAP regions are kept with the default
> > > > values set by the memory map initialization which makes it necessary to
> > > > have a special treatment for such pages in pfn_valid() and
> > > > pfn_valid_within().
> > > 
> > > I assume these pages are never given to the buddy, because we don't have
> > > a direct mapping. So to the kernel, it's essentially just like a memory
> > > hole with benefits.
> > > 
> > > I can spot that we want to export such memory like any special memory
> > > thingy/hole in /proc/iomem -- "reserved", which makes sense.
> > > 
> > > I would assume that MEMBLOCK_NOMAP is a special type of *reserved*
> > > memory. IOW, that for_each_reserved_mem_range() should already succeed
> > > on these as well -- we should mark anything that is MEMBLOCK_NOMAP
> > > implicitly as reserved. Or are there valid reasons not to do so? What
> > > can anyone do with that memory?
> > > 
> > > I assume they are pretty much useless for the kernel, right? Like other
> > > reserved memory ranges.
> > > 
> > 
> > On ARM, we need to know whether any physical regions that do not
> > contain system memory contain something with device semantics or not.
> > One of the examples is ACPI tables: these are in reserved memory, and
> > so they are not covered by the linear region. However, when the ACPI
> > core ioremap()s an arbitrary memory region, we don't know whether it
> > is mapping a memory region or a device region unless we keep track of
> > this in some way. (Device mappings require device attributes, but
> > firmware tables require memory attributes, as they might be accessed
> > using misaligned reads)
> 
> Using generically sounding NOMAP ("don't create direct mapping") to identify
> device regions feels like a hack. I know, it was introduced just for that
> purpose.
> 
> Looking at memblock_mark_nomap(), we consider "device regions"
> 
> 1) ACPI tables
> 
> 2) VIDEO_TYPE_EFI memory
> 
> 3) some device-tree regions in of/fdt.c
> 
> 
> IIUC, right now we end up creating a memmap for this NOMAP memory, but hide
> it away in pfn_valid(). This patch set at least fixes that.

Currently we have memmap entries with struct page set to defaults for the
NOMAP memory. AFAIU hiding them in pfn_valid()/pfn_valid_within() was a
solution to failures in pfn walkers that presumed that for a pfn_valid()
there will be a struct page that really reflects the state of that page.

> Assuming these pages are never mapped to user space via the struct page
> (which better be the case), we could further use a new pagetype to mark
> these pages in a special way, such that we can identify them directly via
> pfn_to_page().

Not sure we really need a new pagetype here, PG_Reserved seems to be quite
enough to say "don't touch this".  I generally agree that we could make
PG_Reserved a PageType and then have several sub-types for reserved memory.
This definitely will add clarity but I'm not sure that this justifies
the amount of churn and effort required to audit uses of PageReserved().
 
> Then, we could mostly avoid having to query memblock at runtime to figure
> out that this is special memory. This would obviously be an extension to
> this series. Just a thought. 

Stop pushing memblock out of kernel! ;-)

Now, seriously, we can minimize memblock involvement in run-time and this
series is yet another step in that direction.
David Hildenbrand April 15, 2021, 9:30 a.m. UTC | #10
> Not sure we really need a new pagetype here, PG_Reserved seems to be quite
> enough to say "don't touch this".  I generally agree that we could make
> PG_Reserved a PageType and then have several sub-types for reserved memory.
> This definitely will add clarity but I'm not sure that this justifies
> amount of churn and effort required to audit uses of PageReserved().
>   
>> Then, we could mostly avoid having to query memblock at runtime to figure
>> out that this is special memory. This would obviously be an extension to
>> this series. Just a thought.
> 
> Stop pushing memblock out of kernel! ;-)

Can't stop. Won't stop. :D

It's lovely for booting up a kernel until we have other data-structures 
in place ;)
Mike Rapoport April 16, 2021, 11:44 a.m. UTC | #11
On Thu, Apr 15, 2021 at 11:30:12AM +0200, David Hildenbrand wrote:
> > Not sure we really need a new pagetype here, PG_Reserved seems to be quite
> > enough to say "don't touch this".  I generally agree that we could make
> > PG_Reserved a PageType and then have several sub-types for reserved memory.
> > This definitely will add clarity but I'm not sure that this justifies
> > amount of churn and effort required to audit uses of PageReserved().
> > > Then, we could mostly avoid having to query memblock at runtime to figure
> > > out that this is special memory. This would obviously be an extension to
> > > this series. Just a thought.
> > 
> > Stop pushing memblock out of kernel! ;-)
> 
> Can't stop. Won't stop. :D
> 
> It's lovely for booting up a kernel until we have other data-structures in
> place ;)

A bit more seriously, we don't have any data structure that reliably
represents physical memory layout in an arch-independent fashion.
memblock is probably the best starting point for eventually having one.
David Hildenbrand April 16, 2021, 11:54 a.m. UTC | #12
On 16.04.21 13:44, Mike Rapoport wrote:
> On Thu, Apr 15, 2021 at 11:30:12AM +0200, David Hildenbrand wrote:
>>> Not sure we really need a new pagetype here, PG_Reserved seems to be quite
>>> enough to say "don't touch this".  I generally agree that we could make
>>> PG_Reserved a PageType and then have several sub-types for reserved memory.
>>> This definitely will add clarity but I'm not sure that this justifies
>>> amount of churn and effort required to audit uses of PageReserved().
>>>> Then, we could mostly avoid having to query memblock at runtime to figure
>>>> out that this is special memory. This would obviously be an extension to
>>>> this series. Just a thought.
>>>
>>> Stop pushing memblock out of kernel! ;-)
>>
>> Can't stop. Won't stop. :D
>>
>> It's lovely for booting up a kernel until we have other data-structures in
>> place ;)
> 
> A bit more seriously, we don't have any data structure that reliably
> represents physical memory layout in an arch-independent fashion.
> memblock is probably the best starting point for eventually having one.

We have the (slowish) kernel resource tree after boot and the (faster) 
memmap. I really don't see why we really need another slowish variant.

We might be better off to just extend and speed up the kernel resource tree.

Memblock, as is, is not a reasonable data structure to keep around after 
boot: for example, how we handle boottime allocations and reserve 
regions both as reserved.
diff mbox series

Patch

diff --git a/mm/memblock.c b/mm/memblock.c
index afaefa8fc6ab..6b7ea9d86310 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2002,6 +2002,26 @@  static unsigned long __init __free_memory_core(phys_addr_t start,
 	return end_pfn - start_pfn;
 }
 
+static void __init memmap_init_reserved_pages(void)
+{
+	struct memblock_region *region;
+	phys_addr_t start, end;
+	u64 i;
+
+	/* initialize struct pages for the reserved regions */
+	for_each_reserved_mem_range(i, &start, &end)
+		reserve_bootmem_region(start, end);
+
+	/* and also treat struct pages for the NOMAP regions as PageReserved */
+	for_each_mem_region(region) {
+		if (memblock_is_nomap(region)) {
+			start = region->base;
+			end = start + region->size;
+			reserve_bootmem_region(start, end);
+		}
+	}
+}
+
 static unsigned long __init free_low_memory_core_early(void)
 {
 	unsigned long count = 0;
@@ -2010,8 +2030,7 @@  static unsigned long __init free_low_memory_core_early(void)
 
 	memblock_clear_hotplug(0, -1);
 
-	for_each_reserved_mem_range(i, &start, &end)
-		reserve_bootmem_region(start, end);
+	memmap_init_reserved_pages();
 
 	/*
 	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id