
memory_hotplug: fix the panic when memory end is not on the section boundary

Message ID 20180910123527.71209-1-zaslonko@linux.ibm.com (mailing list archive)
State New, archived
Series: memory_hotplug: fix the panic when memory end is not on the section boundary

Commit Message

Zaslonko Mikhail Sept. 10, 2018, 12:35 p.m. UTC
If memory end is not aligned with the Linux memory section boundary, such
a section is only partly initialized. This may lead to a VM_BUG_ON due to
uninitialized struct page access from the is_mem_section_removable() or
test_pages_in_a_zone() functions.

Here is one of the panic examples:
 CONFIG_DEBUG_VM_PGFLAGS=y
 kernel parameter mem=3075M

 page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
 ------------[ cut here ]------------
 Call Trace:
 ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
  [<00000000009558ba>] show_mem_removable+0xda/0xe0
  [<00000000009325fc>] dev_attr_show+0x3c/0x80
  [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
  [<00000000003fc4e0>] seq_read+0x208/0x4c8
  [<00000000003cb80e>] __vfs_read+0x46/0x180
  [<00000000003cb9ce>] vfs_read+0x86/0x148
  [<00000000003cc06a>] ksys_read+0x62/0xc0
  [<0000000000c001c0>] system_call+0xdc/0x2d8

This fix checks if the page lies within the zone boundaries before
accessing the struct page data. The check is added to both functions.
Actually, a similar check is already present in the
is_pageblock_removable_nolock() function, but only after the struct page
has been accessed.

Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: <stable@vger.kernel.org>
---
 mm/memory_hotplug.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)
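
To put rough numbers on the panic example above (assuming 4 KB pages and the
256 MB memory block / section size that s390 uses, as noted later in this
thread):

 mem=3075M                -> zone Normal ends at 3075 MB
 last 256 MB block        -> spans [3072 MB, 3328 MB)
 initialized struct pages -> only for [3072 MB, 3075 MB), i.e. 3 MB (768 pages)
 remaining 253 MB         -> memmap stays poisoned and is hit by the walkers above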

Comments

Michal Hocko Sept. 10, 2018, 1:17 p.m. UTC | #1
[Cc Pavel]

On Mon 10-09-18 14:35:27, Mikhail Zaslonko wrote:
> If memory end is not aligned with the linux memory section boundary, such
> a section is only partly initialized. This may lead to VM_BUG_ON due to
> uninitialized struct pages access from is_mem_section_removable() or
> test_pages_in_a_zone() function.
> 
> Here is one of the panic examples:
>  CONFIG_DEBUG_VM_PGFLAGS=y
>  kernel parameter mem=3075M

OK, so the last memory section is not full and we have a partial memory
block right?

>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))

OK, this means that the struct page is not fully initialized. Do you
have a specific place which has triggered this assert?

>  ------------[ cut here ]------------
>  Call Trace:
>  ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
>   [<00000000009558ba>] show_mem_removable+0xda/0xe0
>   [<00000000009325fc>] dev_attr_show+0x3c/0x80
>   [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
>   [<00000000003fc4e0>] seq_read+0x208/0x4c8
>   [<00000000003cb80e>] __vfs_read+0x46/0x180
>   [<00000000003cb9ce>] vfs_read+0x86/0x148
>   [<00000000003cc06a>] ksys_read+0x62/0xc0
>   [<0000000000c001c0>] system_call+0xdc/0x2d8
> 
> This fix checks if the page lies within the zone boundaries before
> accessing the struct page data. The check is added to both functions.
> Actually similar check has already been present in
> is_pageblock_removable_nolock() function but only after the struct page
> is accessed.
> 

Well, I am afraid this is not the proper solution. We are relying on the
full pageblock worth of initialized struct pages in many other places. We
used to rely on that in the past because we initialized the full
section, but this has changed recently. Pavel, do you have any ideas
how to deal with these partial mem sections now?

> Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
> Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
> Cc: <stable@vger.kernel.org>
> ---
>  mm/memory_hotplug.c | 20 +++++++++++---------
>  1 file changed, 11 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 9eea6e809a4e..8e20e8fcc3b0 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1229,9 +1229,8 @@ static struct page *next_active_pageblock(struct page *page)
>  	return page + pageblock_nr_pages;
>  }
>  
> -static bool is_pageblock_removable_nolock(struct page *page)
> +static bool is_pageblock_removable_nolock(struct page *page, struct zone **zone)
>  {
> -	struct zone *zone;
>  	unsigned long pfn;
>  
>  	/*
> @@ -1241,15 +1240,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
>  	 * We have to take care about the node as well. If the node is offline
>  	 * its NODE_DATA will be NULL - see page_zone.
>  	 */
> -	if (!node_online(page_to_nid(page)))
> -		return false;
> -
> -	zone = page_zone(page);
>  	pfn = page_to_pfn(page);
> -	if (!zone_spans_pfn(zone, pfn))
> +	if (*zone && !zone_spans_pfn(*zone, pfn))
>  		return false;
> +	if (!node_online(page_to_nid(page)))
> +		return false;
> +	*zone = page_zone(page);
>  
> -	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
> +	return !has_unmovable_pages(*zone, page, 0, MIGRATE_MOVABLE, true);
>  }
>  
>  /* Checks if this range of memory is likely to be hot-removable. */
> @@ -1257,10 +1255,11 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
>  {
>  	struct page *page = pfn_to_page(start_pfn);
>  	struct page *end_page = page + nr_pages;
> +	struct zone *zone = NULL;
>  
>  	/* Check the starting page of each pageblock within the range */
>  	for (; page < end_page; page = next_active_pageblock(page)) {
> -		if (!is_pageblock_removable_nolock(page))
> +		if (!is_pageblock_removable_nolock(page, &zone))
>  			return false;
>  		cond_resched();
>  	}
> @@ -1296,6 +1295,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
>  				i++;
>  			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
>  				continue;
> +			/* Check if we got outside of the zone */
> +			if (zone && !zone_spans_pfn(zone, pfn))
> +				return 0;
>  			page = pfn_to_page(pfn + i);
>  			if (zone && page_zone(page) != zone)
>  				return 0;
> -- 
> 2.16.4
Pasha Tatashin Sept. 10, 2018, 1:46 p.m. UTC | #2
On 9/10/18 9:17 AM, Michal Hocko wrote:
> [Cc Pavel]
> 
> On Mon 10-09-18 14:35:27, Mikhail Zaslonko wrote:
>> If memory end is not aligned with the linux memory section boundary, such
>> a section is only partly initialized. This may lead to VM_BUG_ON due to
>> uninitialized struct pages access from is_mem_section_removable() or
>> test_pages_in_a_zone() function.
>>
>> Here is one of the panic examples:
>>  CONFIG_DEBUG_VM_PGFLAGS=y
>>  kernel parameter mem=3075M
> 
> OK, so the last memory section is not full and we have a partial memory
> block right?
> 
>>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
> 
> OK, this means that the struct page is not fully initialized. Do you
> have a specific place which has triggered this assert?
> 
>>  ------------[ cut here ]------------
>>  Call Trace:
>>  ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
>>   [<00000000009558ba>] show_mem_removable+0xda/0xe0
>>   [<00000000009325fc>] dev_attr_show+0x3c/0x80
>>   [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
>>   [<00000000003fc4e0>] seq_read+0x208/0x4c8
>>   [<00000000003cb80e>] __vfs_read+0x46/0x180
>>   [<00000000003cb9ce>] vfs_read+0x86/0x148
>>   [<00000000003cc06a>] ksys_read+0x62/0xc0
>>   [<0000000000c001c0>] system_call+0xdc/0x2d8
>>
>> This fix checks if the page lies within the zone boundaries before
>> accessing the struct page data. The check is added to both functions.
>> Actually similar check has already been present in
>> is_pageblock_removable_nolock() function but only after the struct page
>> is accessed.
>>
> 
> Well, I am afraid this is not the proper solution. We are relying on the
> full pageblock worth of initialized struct pages at many other place. We
> used to do that in the past because we have initialized the full
> section but this has been changed recently. Pavel, do you have any ideas
> how to deal with this partial mem sections now?

We have:

remove_memory()
	BUG_ON(check_hotplug_memory_range(start, size))

That is supposed to safely check for this condition: if [start, start +
size) is not block-size aligned (and we know the block size is section
aligned), hot remove is not allowed. The problem is that this check
happens late, and only after an invalid range has already passed through
the previous checks.

We could add check_hotplug_memory_range() to is_mem_section_removable():

is_mem_section_removable(start_pfn, nr_pages)
 if (check_hotplug_memory_range(PFN_PHYS(start_pfn), PFN_PHYS(nr_pages)))
  return false;

I think it should work.

Pavel
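
For illustration only, a sketch of that suggestion applied to the current
is_mem_section_removable() (not a posted patch; check_hotplug_memory_range()
is already a static helper in mm/memory_hotplug.c):

/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Sketch: bail out early on ranges that are not memory-block aligned */
	if (check_hotplug_memory_range(PFN_PHYS(start_pfn), PFN_PHYS(nr_pages)))
		return false;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return false;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return true;
}

(As Mikhail notes further down, the ranges passed in here are already memory
block aligned, so this early check alone would not catch the partially
initialized tail.)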

> 
>> Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
>> Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
>> Cc: <stable@vger.kernel.org>
>> ---
>>  mm/memory_hotplug.c | 20 +++++++++++---------
>>  1 file changed, 11 insertions(+), 9 deletions(-)
>>
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 9eea6e809a4e..8e20e8fcc3b0 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1229,9 +1229,8 @@ static struct page *next_active_pageblock(struct page *page)
>>  	return page + pageblock_nr_pages;
>>  }
>>  
>> -static bool is_pageblock_removable_nolock(struct page *page)
>> +static bool is_pageblock_removable_nolock(struct page *page, struct zone **zone)
>>  {
>> -	struct zone *zone;
>>  	unsigned long pfn;
>>  
>>  	/*
>> @@ -1241,15 +1240,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
>>  	 * We have to take care about the node as well. If the node is offline
>>  	 * its NODE_DATA will be NULL - see page_zone.
>>  	 */
>> -	if (!node_online(page_to_nid(page)))
>> -		return false;
>> -
>> -	zone = page_zone(page);
>>  	pfn = page_to_pfn(page);
>> -	if (!zone_spans_pfn(zone, pfn))
>> +	if (*zone && !zone_spans_pfn(*zone, pfn))
>>  		return false;
>> +	if (!node_online(page_to_nid(page)))
>> +		return false;
>> +	*zone = page_zone(page);
>>  
>> -	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
>> +	return !has_unmovable_pages(*zone, page, 0, MIGRATE_MOVABLE, true);
>>  }
>>  
>>  /* Checks if this range of memory is likely to be hot-removable. */
>> @@ -1257,10 +1255,11 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
>>  {
>>  	struct page *page = pfn_to_page(start_pfn);
>>  	struct page *end_page = page + nr_pages;
>> +	struct zone *zone = NULL;
>>  
>>  	/* Check the starting page of each pageblock within the range */
>>  	for (; page < end_page; page = next_active_pageblock(page)) {
>> -		if (!is_pageblock_removable_nolock(page))
>> +		if (!is_pageblock_removable_nolock(page, &zone))
>>  			return false;
>>  		cond_resched();
>>  	}
>> @@ -1296,6 +1295,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
>>  				i++;
>>  			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
>>  				continue;
>> +			/* Check if we got outside of the zone */
>> +			if (zone && !zone_spans_pfn(zone, pfn))
>> +				return 0;
>>  			page = pfn_to_page(pfn + i);
>>  			if (zone && page_zone(page) != zone)
>>  				return 0;
>> -- 
>> 2.16.4
>
Michal Hocko Sept. 10, 2018, 1:59 p.m. UTC | #3
On Mon 10-09-18 13:46:45, Pavel Tatashin wrote:
> 
> 
> On 9/10/18 9:17 AM, Michal Hocko wrote:
> > [Cc Pavel]
> > 
> > On Mon 10-09-18 14:35:27, Mikhail Zaslonko wrote:
> >> If memory end is not aligned with the linux memory section boundary, such
> >> a section is only partly initialized. This may lead to VM_BUG_ON due to
> >> uninitialized struct pages access from is_mem_section_removable() or
> >> test_pages_in_a_zone() function.
> >>
> >> Here is one of the panic examples:
> >>  CONFIG_DEBUG_VM_PGFLAGS=y
> >>  kernel parameter mem=3075M
> > 
> > OK, so the last memory section is not full and we have a partial memory
> > block right?
> > 
> >>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
> > 
> > OK, this means that the struct page is not fully initialized. Do you
> > have a specific place which has triggered this assert?
> > 
> >>  ------------[ cut here ]------------
> >>  Call Trace:
> >>  ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
> >>   [<00000000009558ba>] show_mem_removable+0xda/0xe0
> >>   [<00000000009325fc>] dev_attr_show+0x3c/0x80
> >>   [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
> >>   [<00000000003fc4e0>] seq_read+0x208/0x4c8
> >>   [<00000000003cb80e>] __vfs_read+0x46/0x180
> >>   [<00000000003cb9ce>] vfs_read+0x86/0x148
> >>   [<00000000003cc06a>] ksys_read+0x62/0xc0
> >>   [<0000000000c001c0>] system_call+0xdc/0x2d8
> >>
> >> This fix checks if the page lies within the zone boundaries before
> >> accessing the struct page data. The check is added to both functions.
> >> Actually similar check has already been present in
> >> is_pageblock_removable_nolock() function but only after the struct page
> >> is accessed.
> >>
> > 
> > Well, I am afraid this is not the proper solution. We are relying on the
> > full pageblock worth of initialized struct pages at many other place. We
> > used to do that in the past because we have initialized the full
> > section but this has been changed recently. Pavel, do you have any ideas
> > how to deal with this partial mem sections now?
> 
> We have:
> 
> remove_memory()
> 	BUG_ON(check_hotplug_memory_range(start, size))
> 
> That supposed to safely check for this condition: if [start, start +
> size) not block size aligned (and we know block size is section
> aligned), hot remove is not allowed. The problem is this check is late,
> and only happens when invalid range has already passed through previous
> checks.
> 
> We could add check_hotplug_memory_range() to is_mem_section_removable():
> 
> is_mem_section_removable(start_pfn, nr_pages)
>  if (check_hotplug_memory_range(PFN_PHYS(start_pfn), PFN_PHYS(nr_pages)))
>   return false;
> 
> I think it should work.

I do not think we want to sprinkle these tests over all pfn walkers. Can
we simply initialize those uninitialized holes as well and make them
reserved without handing them over to the page allocator? That would be
a much more robust approach IMHO.
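
A very rough sketch of that direction (illustrative only, not a posted patch;
the helper name is made up and the per-page steps merely mirror what the
memmap init code does) would be to walk the tail of the last section once
memmap_init_zone() has covered the zone:

/*
 * Illustrative sketch: the memmap for a sparsemem section is allocated as
 * a whole, so pfn_to_page() is valid up to the section end even when the
 * zone ends earlier. Give those trailing struct pages a sane, reserved
 * state so that pfn walkers never see poisoned memmap.
 */
static void __init init_zone_tail_pages(struct zone *zone)
{
	unsigned long pfn = zone_end_pfn(zone);
	unsigned long end = ALIGN(pfn, PAGES_PER_SECTION);
	int nid = zone_to_nid(zone);

	for (; pfn < end; pfn++) {
		struct page *page = pfn_to_page(pfn);

		memset(page, 0, sizeof(*page));
		set_page_links(page, zone_idx(zone), nid, pfn);
		init_page_count(page);
		page_mapcount_reset(page);
		INIT_LIST_HEAD(&page->lru);
		SetPageReserved(page);
	}
}

Something like this would have to be called once per zone after memmap_init(),
and the deferred init path would need the same treatment, as Pavel notes below.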
Pasha Tatashin Sept. 10, 2018, 2:11 p.m. UTC | #4
Hi Michal,

It is tricky, but it can probably be done. Either change
memmap_init_zone() or its caller to also cover the ends and starts of
unaligned sections, initializing and reserving those pages.

The same thing would also need to be done in deferred_init_memmap() to
cover the deferred init case.

For hotplugged memory we do not need to worry, as that one is always
section aligned.

Do you think this would be a better approach?

Thank you,
Pavel
On Mon, Sep 10, 2018 at 10:00 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 10-09-18 13:46:45, Pavel Tatashin wrote:
> >
> >
> > On 9/10/18 9:17 AM, Michal Hocko wrote:
> > > [Cc Pavel]
> > >
> > > On Mon 10-09-18 14:35:27, Mikhail Zaslonko wrote:
> > >> If memory end is not aligned with the linux memory section boundary, such
> > >> a section is only partly initialized. This may lead to VM_BUG_ON due to
> > >> uninitialized struct pages access from is_mem_section_removable() or
> > >> test_pages_in_a_zone() function.
> > >>
> > >> Here is one of the panic examples:
> > >>  CONFIG_DEBUG_VM_PGFLAGS=y
> > >>  kernel parameter mem=3075M
> > >
> > > OK, so the last memory section is not full and we have a partial memory
> > > block right?
> > >
> > >>  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
> > >
> > > OK, this means that the struct page is not fully initialized. Do you
> > > have a specific place which has triggered this assert?
> > >
> > >>  ------------[ cut here ]------------
> > >>  Call Trace:
> > >>  ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
> > >>   [<00000000009558ba>] show_mem_removable+0xda/0xe0
> > >>   [<00000000009325fc>] dev_attr_show+0x3c/0x80
> > >>   [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
> > >>   [<00000000003fc4e0>] seq_read+0x208/0x4c8
> > >>   [<00000000003cb80e>] __vfs_read+0x46/0x180
> > >>   [<00000000003cb9ce>] vfs_read+0x86/0x148
> > >>   [<00000000003cc06a>] ksys_read+0x62/0xc0
> > >>   [<0000000000c001c0>] system_call+0xdc/0x2d8
> > >>
> > >> This fix checks if the page lies within the zone boundaries before
> > >> accessing the struct page data. The check is added to both functions.
> > >> Actually similar check has already been present in
> > >> is_pageblock_removable_nolock() function but only after the struct page
> > >> is accessed.
> > >>
> > >
> > > Well, I am afraid this is not the proper solution. We are relying on the
> > > full pageblock worth of initialized struct pages at many other place. We
> > > used to do that in the past because we have initialized the full
> > > section but this has been changed recently. Pavel, do you have any ideas
> > > how to deal with this partial mem sections now?
> >
> > We have:
> >
> > remove_memory()
> >       BUG_ON(check_hotplug_memory_range(start, size))
> >
> > That supposed to safely check for this condition: if [start, start +
> > size) not block size aligned (and we know block size is section
> > aligned), hot remove is not allowed. The problem is this check is late,
> > and only happens when invalid range has already passed through previous
> > checks.
> >
> > We could add check_hotplug_memory_range() to is_mem_section_removable():
> >
> > is_mem_section_removable(start_pfn, nr_pages)
> >  if (check_hotplug_memory_range(PFN_PHYS(start_pfn), PFN_PHYS(nr_pages)))
> >   return false;
> >
> > I think it should work.
>
> I do not think we want to sprinkle these tests over all pfn walkers. Can
> we simply initialize those uninitialized holes as well and make them
> reserved without handing them over to the page allocator? That would be
> much more robust approach IMHO.
> --
> Michal Hocko
> SUSE Labs
>
Michal Hocko Sept. 10, 2018, 2:19 p.m. UTC | #5
On Mon 10-09-18 14:11:45, Pavel Tatashin wrote:
> Hi Michal,
> 
> It is tricky, but probably can be done. Either change
> memmap_init_zone() or its caller to also cover the ends and starts of
> unaligned sections to initialize and reserve pages.
> 
> The same thing would also need to be done in deferred_init_memmap() to
> cover the deferred init case.

Well, I am not sure TBH. I have to think about that much more. Maybe it
would be much simpler to make sure that we will never add incomplete
memblocks and simply refuse them during the discovery. At least for now.
Pasha Tatashin Sept. 10, 2018, 2:32 p.m. UTC | #6
On Mon, Sep 10, 2018 at 10:19 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 10-09-18 14:11:45, Pavel Tatashin wrote:
> > Hi Michal,
> >
> > It is tricky, but probably can be done. Either change
> > memmap_init_zone() or its caller to also cover the ends and starts of
> > unaligned sections to initialize and reserve pages.
> >
> > The same thing would also need to be done in deferred_init_memmap() to
> > cover the deferred init case.
>
> Well, I am not sure TBH. I have to think about that much more. Maybe it
> would be much more simple to make sure that we will never add incomplete
> memblocks and simply refuse them during the discovery. At least for now.

On x86 memblocks can be up to 2G on machines with over 64G of RAM.
Also, the memory size is way too easy to change via qemu arguments when a
VM starts. If we simply disable unaligned trailing memblocks, I am sure
we would get tons of noise about missing memory.

I think adding check_hotplug_memory_range() would work to fix the
immediate problem. But we do need to figure out a better solution.

The memblock design is based on the archaic assumption that hotplug units
are physical DIMMs. VMs and hypervisors changed all of that, and we can
have much finer hotplug requests on machines with huge DIMMs. Yet we do
not want to pollute sysfs with millions of tiny memory devices. I am not
sure what a proper long-term solution for this problem should be, but I
see that the Linux hotplug/hotremove subsystems must be redesigned based
on the new requirements.

Pavel
Michal Hocko Sept. 10, 2018, 2:41 p.m. UTC | #7
On Mon 10-09-18 14:32:16, Pavel Tatashin wrote:
> On Mon, Sep 10, 2018 at 10:19 AM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Mon 10-09-18 14:11:45, Pavel Tatashin wrote:
> > > Hi Michal,
> > >
> > > It is tricky, but probably can be done. Either change
> > > memmap_init_zone() or its caller to also cover the ends and starts of
> > > unaligned sections to initialize and reserve pages.
> > >
> > > The same thing would also need to be done in deferred_init_memmap() to
> > > cover the deferred init case.
> >
> > Well, I am not sure TBH. I have to think about that much more. Maybe it
> > would be much more simple to make sure that we will never add incomplete
> > memblocks and simply refuse them during the discovery. At least for now.
> 
> On x86 memblocks can be upto 2G on machines with over 64G of RAM.

Sorry, I meant pageblock_nr_pages rather than memblocks.

> Also, memory size is way to easy too change via qemu arguments when VM
> starts. If we simply disable unaligned trailing memblocks, I am sure
> we would get tons of noise of missing memory.
> 
> I think, adding check_hotplug_memory_range() would work to fix the
> immediate problem. But, we do need to figure out  a better solution.
> 
> memblock design is based on archaic assumption that hotplug units are
> physical dimms. VMs and hypervisors changed all of that, and we can
> have much finer hotplug requests on machines with huge DIMMs. Yet, we
> do not want to pollute sysfs with millions of tiny memory devices. I
> am not sure what a long term proper solution for this problem should
> be, but I see that linux hotplug/hotremove subsystems must be
> redesigned based on the new requirements.

Not an easy task though. Anyway, the sparse memory model is heavily based
on memory sections, so it makes some sense to have hotplug be section
based as well. Memblocks as a higher logical unit on top of that are kind
of a hack. The userspace API has never been properly thought through, I am
afraid.
Pasha Tatashin Sept. 10, 2018, 3:26 p.m. UTC | #8
On 9/10/18 10:41 AM, Michal Hocko wrote:
> On Mon 10-09-18 14:32:16, Pavel Tatashin wrote:
>> On Mon, Sep 10, 2018 at 10:19 AM Michal Hocko <mhocko@kernel.org> wrote:
>>>
>>> On Mon 10-09-18 14:11:45, Pavel Tatashin wrote:
>>>> Hi Michal,
>>>>
>>>> It is tricky, but probably can be done. Either change
>>>> memmap_init_zone() or its caller to also cover the ends and starts of
>>>> unaligned sections to initialize and reserve pages.
>>>>
>>>> The same thing would also need to be done in deferred_init_memmap() to
>>>> cover the deferred init case.
>>>
>>> Well, I am not sure TBH. I have to think about that much more. Maybe it
>>> would be much more simple to make sure that we will never add incomplete
>>> memblocks and simply refuse them during the discovery. At least for now.
>>
>> On x86 memblocks can be upto 2G on machines with over 64G of RAM.
> 
> sorry I meant pageblock_nr_pages rather than memblocks.

OK. This sounds reasonable, but to be honest I am not sure how to
achieve it yet; I need to think more about this. In theory, if we have
the sparse memory model, it makes sense to enforce memory alignment to
section sizes, which sounds a lot safer.

> 
>> Also, memory size is way to easy too change via qemu arguments when VM
>> starts. If we simply disable unaligned trailing memblocks, I am sure
>> we would get tons of noise of missing memory.
>>
>> I think, adding check_hotplug_memory_range() would work to fix the
>> immediate problem. But, we do need to figure out  a better solution.
>>
>> memblock design is based on archaic assumption that hotplug units are
>> physical dimms. VMs and hypervisors changed all of that, and we can
>> have much finer hotplug requests on machines with huge DIMMs. Yet, we
>> do not want to pollute sysfs with millions of tiny memory devices. I
>> am not sure what a long term proper solution for this problem should
>> be, but I see that linux hotplug/hotremove subsystems must be
>> redesigned based on the new requirements.
> 
> Not an easy task though. Anyway, sparse memory modely is highly based on
> memory sections so it makes some sense to have hotplug section based as
> well. Memblocks as a higher logical unit on top of that is kinda hack.
> The userspace API has never been properly thought through I am afraid.

I agree that memoryblock is a hack; it fails to do both things it was
designed to do:

1. On bare metal you cannot free a physical DIMM of memory at
memoryblock granularity, because memory devices do not correspond to
physical DIMMs. Thus, if for some reason a particular DIMM must be
removed or replaced, memoryblock does not help us.

2. On machines with hypervisors it fails to provide an adequate
granularity to add/remove memory.

We should define a new user interface where memory can be added/removed
at a finer granularity: sparse section size, but without a memory
device for each section. We should also provide optional access to a
legacy interface where memory devices are exported but each is of
section size.

So, when the legacy interface is enabled, the current way would work:

echo offline > /sys/devices/system/memory/memoryXXX/state

And new interface would allow us to do something like this:

echo offline 256M > /sys/devices/system/node/nodeXXX/memory

With optional start address for offline memory.
echo offline [start_pa] size > /sys/devices/system/node/nodeXXX/memory
start_pa and size must be section size aligned (128M).

How to solve the current memory hotplug interface limitations would
probably be a good topic for the next MM Summit.

Pavel
Michal Hocko Sept. 11, 2018, 9:16 a.m. UTC | #9
On Mon 10-09-18 15:26:55, Pavel Tatashin wrote:
> 
> 
> On 9/10/18 10:41 AM, Michal Hocko wrote:
> > On Mon 10-09-18 14:32:16, Pavel Tatashin wrote:
> >> On Mon, Sep 10, 2018 at 10:19 AM Michal Hocko <mhocko@kernel.org> wrote:
> >>>
> >>> On Mon 10-09-18 14:11:45, Pavel Tatashin wrote:
> >>>> Hi Michal,
> >>>>
> >>>> It is tricky, but probably can be done. Either change
> >>>> memmap_init_zone() or its caller to also cover the ends and starts of
> >>>> unaligned sections to initialize and reserve pages.
> >>>>
> >>>> The same thing would also need to be done in deferred_init_memmap() to
> >>>> cover the deferred init case.
> >>>
> >>> Well, I am not sure TBH. I have to think about that much more. Maybe it
> >>> would be much more simple to make sure that we will never add incomplete
> >>> memblocks and simply refuse them during the discovery. At least for now.
> >>
> >> On x86 memblocks can be upto 2G on machines with over 64G of RAM.
> > 
> > sorry I meant pageblock_nr_pages rather than memblocks.
> 
> OK. This sound reasonable, but, to be honest I am not sure how to
> achieve this yet, I need to think more about this. In theory, if we have
> sparse memory model, it makes sense to enforce memory alignment to
> section sizes, sounds a lot safer.

Memory hotplug is sparsemem only. You do not have to think about other
memory models fortunately.
 
> >> Also, memory size is way to easy too change via qemu arguments when VM
> >> starts. If we simply disable unaligned trailing memblocks, I am sure
> >> we would get tons of noise of missing memory.
> >>
> >> I think, adding check_hotplug_memory_range() would work to fix the
> >> immediate problem. But, we do need to figure out  a better solution.
> >>
> >> memblock design is based on archaic assumption that hotplug units are
> >> physical dimms. VMs and hypervisors changed all of that, and we can
> >> have much finer hotplug requests on machines with huge DIMMs. Yet, we
> >> do not want to pollute sysfs with millions of tiny memory devices. I
> >> am not sure what a long term proper solution for this problem should
> >> be, but I see that linux hotplug/hotremove subsystems must be
> >> redesigned based on the new requirements.
> > 
> > Not an easy task though. Anyway, sparse memory modely is highly based on
> > memory sections so it makes some sense to have hotplug section based as
> > well. Memblocks as a higher logical unit on top of that is kinda hack.
> > The userspace API has never been properly thought through I am afraid.
> 
> I agree memoryblock is a hack, it fails to do both things it was
> designed to do:
> 
> 1. On bare metal you cannot free a physical dimm of memory using
> memoryblock granularity because memory devices do not equal to physical
> dimms. Thus, if for some reason a particular dimm must be
> remove/replaced, memoryblock does not help us.

agreed

> 2. On machines with hypervisors it fails to provide an adequate
> granularity to add/remove memory.
> 
> We should define a new user interface where memory can be added/removed
> at a finer granularity: sparse section size, but without a memory
> devices for each section. We should also provide an optional access to
> legacy interface where memory devices are exported but each is of
> section size.
> 
> So, when legacy interface is enabled, current way would work:
> 
> echo offline > /sys/devices/system/memory/memoryXXX/state
> 
> And new interface would allow us to do something like this:
> 
> echo offline 256M > /sys/devices/system/node/nodeXXX/memory
> 
> With optional start address for offline memory.
> echo offline [start_pa] size > /sys/devices/system/node/nodeXXX/memory
> start_pa and size must be section size aligned (128M).

I am not sure what the expected semantics of the version without
start_pa would be.

> It would probably be a good discussion for the next MM Summit how to
> solve the current memory hotplug interface limitations.

Yes, sounds good to me. In any case let's not pollute this email thread
with this discussion now.
Zaslonko Mikhail Sept. 11, 2018, 2:06 p.m. UTC | #10
On 10.09.2018 15:17, Michal Hocko wrote:
> [Cc Pavel]
>
> On Mon 10-09-18 14:35:27, Mikhail Zaslonko wrote:
>> If memory end is not aligned with the linux memory section boundary, such
>> a section is only partly initialized. This may lead to VM_BUG_ON due to
>> uninitialized struct pages access from is_mem_section_removable() or
>> test_pages_in_a_zone() function.
>>
>> Here is one of the panic examples:
>>   CONFIG_DEBUG_VM_PGFLAGS=y
>>   kernel parameter mem=3075M
> OK, so the last memory section is not full and we have a partial memory
> block right?

Right. In my example above, I define 3075M (3 GiB + 3 MiB) of base memory
in the kernel parameters. As a result we end up with the last memory block
having only 3 megabytes initialized. The initialization takes place within
the memmap_init_zone(unsigned long size, ...) function called from
free_area_init_core() with size = zone->spanned_pages. Thus, only three
megabytes of the last memory block are initialized (up to the end of zone
Normal). And with the page poisoning introduced by Pavel, we fail when
processing such a memory block in the memory_hotplug code (no actual
memory hotplug is involved here).

>
>>   page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
> OK, this means that the struct page is not fully initialized. Do you
> have a specific place which has triggered this assert?

This assert is triggered in the page_to_nid() function when it is called
for an uninitialized page. I found two places where that can happen:
1) is_pageblock_removable_nolock() - direct call
2) test_pages_in_a_zone() - via the page_zone() call

>
>>   ------------[ cut here ]------------
>>   Call Trace:
>>   ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
>>    [<00000000009558ba>] show_mem_removable+0xda/0xe0
>>    [<00000000009325fc>] dev_attr_show+0x3c/0x80
>>    [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
>>    [<00000000003fc4e0>] seq_read+0x208/0x4c8
>>    [<00000000003cb80e>] __vfs_read+0x46/0x180
>>    [<00000000003cb9ce>] vfs_read+0x86/0x148
>>    [<00000000003cc06a>] ksys_read+0x62/0xc0
>>    [<0000000000c001c0>] system_call+0xdc/0x2d8
>>
>> This fix checks if the page lies within the zone boundaries before
>> accessing the struct page data. The check is added to both functions.
>> Actually similar check has already been present in
>> is_pageblock_removable_nolock() function but only after the struct page
>> is accessed.
>>
> Well, I am afraid this is not the proper solution. We are relying on the
> full pageblock worth of initialized struct pages at many other place. We
> used to do that in the past because we have initialized the full
> section but this has been changed recently. Pavel, do you have any ideas
> how to deal with this partial mem sections now?

I think this is not related to the recent changes in memory
initialization. If you mean the deferred init case, the problem exists
even without the CONFIG_DEFERRED_STRUCT_PAGE_INIT kernel option.

>> Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
>> Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
>> Cc: <stable@vger.kernel.org>
>> ---
>>   mm/memory_hotplug.c | 20 +++++++++++---------
>>   1 file changed, 11 insertions(+), 9 deletions(-)
>>
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 9eea6e809a4e..8e20e8fcc3b0 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1229,9 +1229,8 @@ static struct page *next_active_pageblock(struct page *page)
>>   	return page + pageblock_nr_pages;
>>   }
>>   
>> -static bool is_pageblock_removable_nolock(struct page *page)
>> +static bool is_pageblock_removable_nolock(struct page *page, struct zone **zone)
>>   {
>> -	struct zone *zone;
>>   	unsigned long pfn;
>>   
>>   	/*
>> @@ -1241,15 +1240,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
>>   	 * We have to take care about the node as well. If the node is offline
>>   	 * its NODE_DATA will be NULL - see page_zone.
>>   	 */
>> -	if (!node_online(page_to_nid(page)))
>> -		return false;
>> -
>> -	zone = page_zone(page);
>>   	pfn = page_to_pfn(page);
>> -	if (!zone_spans_pfn(zone, pfn))
>> +	if (*zone && !zone_spans_pfn(*zone, pfn))
>>   		return false;
>> +	if (!node_online(page_to_nid(page)))
>> +		return false;
>> +	*zone = page_zone(page);
>>   
>> -	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
>> +	return !has_unmovable_pages(*zone, page, 0, MIGRATE_MOVABLE, true);
>>   }
>>   
>>   /* Checks if this range of memory is likely to be hot-removable. */
>> @@ -1257,10 +1255,11 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
>>   {
>>   	struct page *page = pfn_to_page(start_pfn);
>>   	struct page *end_page = page + nr_pages;
>> +	struct zone *zone = NULL;
>>   
>>   	/* Check the starting page of each pageblock within the range */
>>   	for (; page < end_page; page = next_active_pageblock(page)) {
>> -		if (!is_pageblock_removable_nolock(page))
>> +		if (!is_pageblock_removable_nolock(page, &zone))
>>   			return false;
>>   		cond_resched();
>>   	}
>> @@ -1296,6 +1295,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
>>   				i++;
>>   			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
>>   				continue;
>> +			/* Check if we got outside of the zone */
>> +			if (zone && !zone_spans_pfn(zone, pfn))
>> +				return 0;
>>   			page = pfn_to_page(pfn + i);
>>   			if (zone && page_zone(page) != zone)
>>   				return 0;
>> -- 
>> 2.16.4
Zaslonko Mikhail Sept. 11, 2018, 2:08 p.m. UTC | #11
On 10.09.2018 15:46, Pasha Tatashin wrote:
>
> On 9/10/18 9:17 AM, Michal Hocko wrote:
>> [Cc Pavel]
>>
>> On Mon 10-09-18 14:35:27, Mikhail Zaslonko wrote:
>>> If memory end is not aligned with the linux memory section boundary, such
>>> a section is only partly initialized. This may lead to VM_BUG_ON due to
>>> uninitialized struct pages access from is_mem_section_removable() or
>>> test_pages_in_a_zone() function.
>>>
>>> Here is one of the panic examples:
>>>   CONFIG_DEBUG_VM_PGFLAGS=y
>>>   kernel parameter mem=3075M
>> OK, so the last memory section is not full and we have a partial memory
>> block right?
>>
>>>   page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
>> OK, this means that the struct page is not fully initialized. Do you
>> have a specific place which has triggered this assert?
>>
>>>   ------------[ cut here ]------------
>>>   Call Trace:
>>>   ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
>>>    [<00000000009558ba>] show_mem_removable+0xda/0xe0
>>>    [<00000000009325fc>] dev_attr_show+0x3c/0x80
>>>    [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
>>>    [<00000000003fc4e0>] seq_read+0x208/0x4c8
>>>    [<00000000003cb80e>] __vfs_read+0x46/0x180
>>>    [<00000000003cb9ce>] vfs_read+0x86/0x148
>>>    [<00000000003cc06a>] ksys_read+0x62/0xc0
>>>    [<0000000000c001c0>] system_call+0xdc/0x2d8
>>>
>>> This fix checks if the page lies within the zone boundaries before
>>> accessing the struct page data. The check is added to both functions.
>>> Actually similar check has already been present in
>>> is_pageblock_removable_nolock() function but only after the struct page
>>> is accessed.
>>>
>> Well, I am afraid this is not the proper solution. We are relying on the
>> full pageblock worth of initialized struct pages at many other place. We
>> used to do that in the past because we have initialized the full
>> section but this has been changed recently. Pavel, do you have any ideas
>> how to deal with this partial mem sections now?
> We have:
>
> remove_memory()
> 	BUG_ON(check_hotplug_memory_range(start, size))
>
> That supposed to safely check for this condition: if [start, start +
> size) not block size aligned (and we know block size is section
> aligned), hot remove is not allowed. The problem is this check is late,
> and only happens when invalid range has already passed through previous
> checks.
>
> We could add check_hotplug_memory_range() to is_mem_section_removable():
>
> is_mem_section_removable(start_pfn, nr_pages)
>   if (check_hotplug_memory_range(PFN_PHYS(start_pfn), PFN_PHYS(nr_pages)))
>    return false;
>
> I think it should work.

I don't think so, since is_mem_section_removable() is called for the
entire section. Thus [start_pfn, start_pfn + nr_pages) is always memory
block aligned.

>
> Pavel
>
>>> Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
>>> Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
>>> Cc: <stable@vger.kernel.org>
>>> ---
>>>   mm/memory_hotplug.c | 20 +++++++++++---------
>>>   1 file changed, 11 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>>> index 9eea6e809a4e..8e20e8fcc3b0 100644
>>> --- a/mm/memory_hotplug.c
>>> +++ b/mm/memory_hotplug.c
>>> @@ -1229,9 +1229,8 @@ static struct page *next_active_pageblock(struct page *page)
>>>   	return page + pageblock_nr_pages;
>>>   }
>>>   
>>> -static bool is_pageblock_removable_nolock(struct page *page)
>>> +static bool is_pageblock_removable_nolock(struct page *page, struct zone **zone)
>>>   {
>>> -	struct zone *zone;
>>>   	unsigned long pfn;
>>>   
>>>   	/*
>>> @@ -1241,15 +1240,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
>>>   	 * We have to take care about the node as well. If the node is offline
>>>   	 * its NODE_DATA will be NULL - see page_zone.
>>>   	 */
>>> -	if (!node_online(page_to_nid(page)))
>>> -		return false;
>>> -
>>> -	zone = page_zone(page);
>>>   	pfn = page_to_pfn(page);
>>> -	if (!zone_spans_pfn(zone, pfn))
>>> +	if (*zone && !zone_spans_pfn(*zone, pfn))
>>>   		return false;
>>> +	if (!node_online(page_to_nid(page)))
>>> +		return false;
>>> +	*zone = page_zone(page);
>>>   
>>> -	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
>>> +	return !has_unmovable_pages(*zone, page, 0, MIGRATE_MOVABLE, true);
>>>   }
>>>   
>>>   /* Checks if this range of memory is likely to be hot-removable. */
>>> @@ -1257,10 +1255,11 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
>>>   {
>>>   	struct page *page = pfn_to_page(start_pfn);
>>>   	struct page *end_page = page + nr_pages;
>>> +	struct zone *zone = NULL;
>>>   
>>>   	/* Check the starting page of each pageblock within the range */
>>>   	for (; page < end_page; page = next_active_pageblock(page)) {
>>> -		if (!is_pageblock_removable_nolock(page))
>>> +		if (!is_pageblock_removable_nolock(page, &zone))
>>>   			return false;
>>>   		cond_resched();
>>>   	}
>>> @@ -1296,6 +1295,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
>>>   				i++;
>>>   			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
>>>   				continue;
>>> +			/* Check if we got outside of the zone */
>>> +			if (zone && !zone_spans_pfn(zone, pfn))
>>> +				return 0;
>>>   			page = pfn_to_page(pfn + i);
>>>   			if (zone && page_zone(page) != zone)
>>>   				return 0;
>>> -- 
>>> 2.16.4
Michal Hocko Sept. 12, 2018, 12:21 p.m. UTC | #12
On Tue 11-09-18 16:06:23, Zaslonko Mikhail wrote:
[...]
> > Well, I am afraid this is not the proper solution. We are relying on the
> > full pageblock worth of initialized struct pages at many other place. We
> > used to do that in the past because we have initialized the full
> > section but this has been changed recently. Pavel, do you have any ideas
> > how to deal with this partial mem sections now?
> 
> I think this is not related to the recent changes of memory initialization.
> If
> you mean deferred init case, the problem exists even without
> CONFIG_DEFERRED_STRUCT_PAGE_INIT kernel option.

This is more about struct page initialization, which doesn't clear the
whole memmap area, and as such the remainder stays uninitialized. So you
are right, this is a much older issue; we just happened not to notice it
without explicit memmap poisoning.
Gerald Schaefer Sept. 12, 2018, 1:03 p.m. UTC | #13
On Mon, 10 Sep 2018 15:17:54 +0200
Michal Hocko <mhocko@kernel.org> wrote:

> [Cc Pavel]
> 
> On Mon 10-09-18 14:35:27, Mikhail Zaslonko wrote:
> > If memory end is not aligned with the linux memory section boundary, such
> > a section is only partly initialized. This may lead to VM_BUG_ON due to
> > uninitialized struct pages access from is_mem_section_removable() or
> > test_pages_in_a_zone() function.
> > 
> > Here is one of the panic examples:
> >  CONFIG_DEBUG_VM_PGFLAGS=y
> >  kernel parameter mem=3075M  
> 
> OK, so the last memory section is not full and we have a partial memory
> block right?
> 
> >  page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))  
> 
> OK, this means that the struct page is not fully initialized. Do you
> have a specific place which has triggered this assert?
> 
> >  ------------[ cut here ]------------
> >  Call Trace:
> >  ([<000000000039b8a4>] is_mem_section_removable+0xcc/0x1c0)
> >   [<00000000009558ba>] show_mem_removable+0xda/0xe0
> >   [<00000000009325fc>] dev_attr_show+0x3c/0x80
> >   [<000000000047e7ea>] sysfs_kf_seq_show+0xda/0x160
> >   [<00000000003fc4e0>] seq_read+0x208/0x4c8
> >   [<00000000003cb80e>] __vfs_read+0x46/0x180
> >   [<00000000003cb9ce>] vfs_read+0x86/0x148
> >   [<00000000003cc06a>] ksys_read+0x62/0xc0
> >   [<0000000000c001c0>] system_call+0xdc/0x2d8
> > 
> > This fix checks if the page lies within the zone boundaries before
> > accessing the struct page data. The check is added to both functions.
> > Actually similar check has already been present in
> > is_pageblock_removable_nolock() function but only after the struct page
> > is accessed.
> >   
> 
> Well, I am afraid this is not the proper solution. We are relying on the
> full pageblock worth of initialized struct pages at many other place. We
> used to do that in the past because we have initialized the full
> section but this has been changed recently. Pavel, do you have any ideas
> how to deal with this partial mem sections now?

This is not about partly initialized pageblocks, it is about partly
initialized struct pages for memory (hotplug) blocks. And this also
seems to "work as designed", i.e. memmap_init_zone() will stop at
zone->spanned_pages, and not initialize the full memory block if
you specify a not-memory-block-aligned mem= parameter.

"Normal" memory management code should never get in touch with the
uninitialized part, only the 2 memory hotplug sysfs handlers for
"valid_zones" and "removable" will iterate over a complete memory block.

So it does seem rather straight-forward to simply fix those 2 functions,
and not let them go beyond zone->spanned_pages, which is what Mikhails patch
would do. What exactly is wrong with that approach, and how would you rather
have it fixed? A patch that changes memmap_init_zone() to initialize all
struct pages of the last memory block, even beyond zone->spanned_pages?
Could this be done w/o side-effects?

If you look at test_pages_in_a_zone(), there is already some logic that
obviously assumes that at least the first page of the memory block is
initialized, and then while iterating over the rest, a check for
zone_spans_pfn() can easily be added. Mikhail applied the same logic
to is_mem_section_removable(), and his patch does fix the panic (or access
to uninitialized struct pages w/o DEBUG_VM poisoning).

BTW, those sysfs attributes are world-readable, so anyone can trigger
the panic by simply reading them, or by just running lsmem (also available
for x86 since util-linux 2.32). OK, you need a special not-memory-block-aligned
mem= parameter and DEBUG_VM for the poison check, but w/o DEBUG_VM you would
still access uninitialized struct pages. This sounds very wrong, and I
think it really should be fixed.
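
For instance (paths as in the standard memory hotplug sysfs layout), an
unprivileged "cat /sys/devices/system/memory/memory*/removable" or a plain
lsmem run is enough to make these handlers walk every memory block.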

> 
> > Signed-off-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
> > Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
> > Cc: <stable@vger.kernel.org>
> > ---
> >  mm/memory_hotplug.c | 20 +++++++++++---------
> >  1 file changed, 11 insertions(+), 9 deletions(-)
> > 
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index 9eea6e809a4e..8e20e8fcc3b0 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -1229,9 +1229,8 @@ static struct page *next_active_pageblock(struct page *page)
> >  	return page + pageblock_nr_pages;
> >  }
> >  
> > -static bool is_pageblock_removable_nolock(struct page *page)
> > +static bool is_pageblock_removable_nolock(struct page *page, struct zone **zone)
> >  {
> > -	struct zone *zone;
> >  	unsigned long pfn;
> >  
> >  	/*
> > @@ -1241,15 +1240,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
> >  	 * We have to take care about the node as well. If the node is offline
> >  	 * its NODE_DATA will be NULL - see page_zone.
> >  	 */
> > -	if (!node_online(page_to_nid(page)))
> > -		return false;
> > -
> > -	zone = page_zone(page);
> >  	pfn = page_to_pfn(page);
> > -	if (!zone_spans_pfn(zone, pfn))
> > +	if (*zone && !zone_spans_pfn(*zone, pfn))
> >  		return false;
> > +	if (!node_online(page_to_nid(page)))
> > +		return false;
> > +	*zone = page_zone(page);
> >  
> > -	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
> > +	return !has_unmovable_pages(*zone, page, 0, MIGRATE_MOVABLE, true);
> >  }
> >  
> >  /* Checks if this range of memory is likely to be hot-removable. */
> > @@ -1257,10 +1255,11 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
> >  {
> >  	struct page *page = pfn_to_page(start_pfn);
> >  	struct page *end_page = page + nr_pages;
> > +	struct zone *zone = NULL;
> >  
> >  	/* Check the starting page of each pageblock within the range */
> >  	for (; page < end_page; page = next_active_pageblock(page)) {
> > -		if (!is_pageblock_removable_nolock(page))
> > +		if (!is_pageblock_removable_nolock(page, &zone))
> >  			return false;
> >  		cond_resched();
> >  	}
> > @@ -1296,6 +1295,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
> >  				i++;
> >  			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
> >  				continue;
> > +			/* Check if we got outside of the zone */
> > +			if (zone && !zone_spans_pfn(zone, pfn))
> > +				return 0;
> >  			page = pfn_to_page(pfn + i);
> >  			if (zone && page_zone(page) != zone)
> >  				return 0;
> > -- 
> > 2.16.4  
>
Michal Hocko Sept. 12, 2018, 1:39 p.m. UTC | #14
On Wed 12-09-18 15:03:56, Gerald Schaefer wrote:
[...]
> BTW, those sysfs attributes are world-readable, so anyone can trigger
> the panic by simply reading them, or just run lsmem (also available for
> x86 since util-linux 2.32). OK, you need a special not-memory-block-aligned
> mem= parameter and DEBUG_VM for poison check, but w/o DEBUG_VM you would
> still access uninitialized struct pages. This sounds very wrong, and I
> think it really should be fixed.

Ohh, absolutely. Nobody is questioning that. The thing is that the
code has likely always been broken. We just haven't noticed because
those uninitialized parts were zeroed previously. Now that the implicit
zeroing is gone, it is just visible.

All that I am arguing is that there are many places which assume
pageblocks to be fully initialized, and plugging one place that blows up
at a time is just whack-a-mole. We need to address this much earlier,
e.g. by allowing only full pageblocks when adding a memory range.
Gerald Schaefer Sept. 12, 2018, 2:27 p.m. UTC | #15
On Wed, 12 Sep 2018 15:39:33 +0200
Michal Hocko <mhocko@kernel.org> wrote:

> On Wed 12-09-18 15:03:56, Gerald Schaefer wrote:
> [...]
> > BTW, those sysfs attributes are world-readable, so anyone can trigger
> > the panic by simply reading them, or just run lsmem (also available for
> > x86 since util-linux 2.32). OK, you need a special not-memory-block-aligned
> > mem= parameter and DEBUG_VM for poison check, but w/o DEBUG_VM you would
> > still access uninitialized struct pages. This sounds very wrong, and I
> > think it really should be fixed.  
> 
> Ohh, absolutely. Nobody is questioning that. The thing is that the
> code has been likely always broken. We just haven't noticed because
> those unitialized parts where zeroed previously. Now that the implicit
> zeroying is gone it is just visible.
> 
> All that I am arguing is that there are many places which assume
> pageblocks to be fully initialized and plugging one place that blows up
> at the time is just whack a mole. We need to address this much earlier.
> E.g. by allowing only full pageblocks when adding a memory range.

Just to make sure we are talking about the same thing: when you say
"pageblocks", do you mean the MAX_ORDER_NR_PAGES / pageblock_nr_pages
unit of pages, or do you mean the memory (hotplug) block unit?

I do not see any issue here with MAX_ORDER_NR_PAGES / pageblock_nr_pages
pageblocks, and if there was such an issue, of course you are right that
this would affect many places. If there was such an issue, I would also
assume that we would see the new page poison warning in many other places.

The bug that Mikhail's patch would fix only affects code that operates
on / iterates through memory (hotplug) blocks, and that does not happen
in many places, only in the two functions that his patch fixes.

When you say "address this much earlier", do you mean changing the way
that free_area_init_core()/memmap_init() initialize struct pages, i.e.
have them not use zone->spanned_pages as limit, but rather align that
up to the memory block (not pageblock) boundary?
Gerald Schaefer Sept. 12, 2018, 2:28 p.m. UTC | #16
On Mon, 10 Sep 2018 15:26:55 +0000
Pasha Tatashin <Pavel.Tatashin@microsoft.com> wrote:

> 
> I agree memoryblock is a hack, it fails to do both things it was
> designed to do:
> 
> 1. On bare metal you cannot free a physical dimm of memory using
> memoryblock granularity because memory devices do not equal to physical
> dimms. Thus, if for some reason a particular dimm must be
> remove/replaced, memoryblock does not help us.
> 
> 2. On machines with hypervisors it fails to provide an adequate
> granularity to add/remove memory.
> 
> We should define a new user interface where memory can be added/removed
> at a finer granularity: sparse section size, but without a memory
> devices for each section. We should also provide an optional access to
> legacy interface where memory devices are exported but each is of
> section size.
> 
> So, when legacy interface is enabled, current way would work:
> 
> echo offline > /sys/devices/system/memory/memoryXXX/state
> 
> And new interface would allow us to do something like this:
> 
> echo offline 256M > /sys/devices/system/node/nodeXXX/memory
> 
> With optional start address for offline memory.
> echo offline [start_pa] size > /sys/devices/system/node/nodeXXX/memory
> start_pa and size must be section size aligned (128M).
> 
> It would probably be a good discussion for the next MM Summit how to
> solve the current memory hotplug interface limitations.

Please keep lsmem/chmem from util-linux in mind, when changing the
memory hotplug user interface.
Pasha Tatashin Sept. 12, 2018, 2:40 p.m. UTC | #17
On 9/12/18 10:27 AM, Gerald Schaefer wrote:
> On Wed, 12 Sep 2018 15:39:33 +0200
> Michal Hocko <mhocko@kernel.org> wrote:
> 
>> On Wed 12-09-18 15:03:56, Gerald Schaefer wrote:
>> [...]
>>> BTW, those sysfs attributes are world-readable, so anyone can trigger
>>> the panic by simply reading them, or just run lsmem (also available for
>>> x86 since util-linux 2.32). OK, you need a special not-memory-block-aligned
>>> mem= parameter and DEBUG_VM for poison check, but w/o DEBUG_VM you would
>>> still access uninitialized struct pages. This sounds very wrong, and I
>>> think it really should be fixed.  
>>
>> Ohh, absolutely. Nobody is questioning that. The thing is that the
>> code has been likely always broken. We just haven't noticed because
>> those unitialized parts where zeroed previously. Now that the implicit
>> zeroying is gone it is just visible.
>>
>> All that I am arguing is that there are many places which assume
>> pageblocks to be fully initialized and plugging one place that blows up
>> at the time is just whack a mole. We need to address this much earlier.
>> E.g. by allowing only full pageblocks when adding a memory range.
> 
> Just to make sure we are talking about the same thing: when you say
> "pageblocks", do you mean the MAX_ORDER_NR_PAGES / pageblock_nr_pages
> unit of pages, or do you mean the memory (hotplug) block unit?

From the earlier discussion, this was about pageblock_nr_pages, not about
memory_block_size_bytes.

> 
> I do not see any issue here with MAX_ORDER_NR_PAGES / pageblock_nr_pages
> pageblocks, and if there was such an issue, of course you are right that
> this would affect many places. If there was such an issue, I would also
> assume that we would see the new page poison warning in many other places.
> 
> The bug that Mikhails patch would fix only affects code that operates
> on / iterates through memory (hotplug) blocks, and that does not happen
> in many places, only in the two functions that his patch fixes.

Just to be clear: so the memory is pageblock_nr_pages aligned, yet
memory blocks are larger and the panic is still triggered?

I ask because 3075M is not 128M aligned.

> 
> When you say "address this much earlier", do you mean changing the way
> that free_area_init_core()/memmap_init() initialize struct pages, i.e.
> have them not use zone->spanned_pages as limit, but rather align that
> up to the memory block (not pageblock) boundary?
> 

This was my initial proposal: fix memmap_init() and initialize struct
pages beyond the "end", and before the "start", to cover the whole
section. But I think Michal suggested (and he may correct me) to simply
reject section-unaligned memory much earlier: anything that does not
align to the sparse section size is not added to the system at all.

I think Michal's proposal would simplify and strengthen memory mapping
overall.

Pavel
Gerald Schaefer Sept. 12, 2018, 3:51 p.m. UTC | #18
On Wed, 12 Sep 2018 14:40:18 +0000
Pasha Tatashin <Pavel.Tatashin@microsoft.com> wrote:

> On 9/12/18 10:27 AM, Gerald Schaefer wrote:
> > On Wed, 12 Sep 2018 15:39:33 +0200
> > Michal Hocko <mhocko@kernel.org> wrote:
> >   
> >> On Wed 12-09-18 15:03:56, Gerald Schaefer wrote:
> >> [...]  
> >>> BTW, those sysfs attributes are world-readable, so anyone can trigger
> >>> the panic by simply reading them, or just run lsmem (also available for
> >>> x86 since util-linux 2.32). OK, you need a special not-memory-block-aligned
> >>> mem= parameter and DEBUG_VM for poison check, but w/o DEBUG_VM you would
> >>> still access uninitialized struct pages. This sounds very wrong, and I
> >>> think it really should be fixed.    
> >>
> >> Ohh, absolutely. Nobody is questioning that. The thing is that the
> >> code has been likely always broken. We just haven't noticed because
> >> those unitialized parts where zeroed previously. Now that the implicit
> >> zeroying is gone it is just visible.
> >>
> >> All that I am arguing is that there are many places which assume
> >> pageblocks to be fully initialized and plugging one place that blows up
> >> at the time is just whack a mole. We need to address this much earlier.
> >> E.g. by allowing only full pageblocks when adding a memory range.  
> > 
> > Just to make sure we are talking about the same thing: when you say
> > "pageblocks", do you mean the MAX_ORDER_NR_PAGES / pageblock_nr_pages
> > unit of pages, or do you mean the memory (hotplug) block unit?  
> 
> From the early discussion, it was about pageblock_nr_pages, not about
> memory_block_size_bytes.
> 
> > 
> > I do not see any issue here with MAX_ORDER_NR_PAGES / pageblock_nr_pages
> > pageblocks, and if there was such an issue, of course you are right that
> > this would affect many places. If there was such an issue, I would also
> > assume that we would see the new page poison warning in many other places.
> > 
> > The bug that Mikhail's patch would fix only affects code that operates
> > on / iterates through memory (hotplug) blocks, and that does not happen
> > in many places, only in the two functions that his patch fixes.  
> 
> Just to be clear: so memory is pageblock_nr_pages aligned, yet the
> memory_block is larger and the panic is still triggered?
> 
> I ask, because 3075M is not 128M aligned.

Correct, 3075M is pageblock aligned (at least on s390), but not memory
block aligned (256 MB on s390). And the "not memory block aligned" is the
reason for the panic, because at least the two memory hotplug functions
seem to rely on completely initialized struct pages for a memory block.
In this scenario we don't have any partly initialized pageblocks.

While thinking about this, it may actually be possible to create a
not-pageblock-aligned scenario with mem= as well, e.g. with mem=2097148K.
I didn't try this, and I had assumed that at least pageblock alignment
would always be present, but from a quick glance at the mem= parsing
such a scenario does seem possible. Then we really would have partly
initialized pageblocks, and maybe other problems would occur.
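
As a quick illustration of the arithmetic (a userspace sketch; the 1 MiB
pageblock and 256 MiB memory block / section sizes are assumed s390 values,
hard-coded here rather than taken from the kernel headers):

#include <stdio.h>

/* Assumed s390 sizes, for illustration only */
#define PAGEBLOCK_BYTES	(1UL << 20)	/* pageblock: 1 MiB */
#define MEMBLOCK_BYTES	(256UL << 20)	/* memory block / section: 256 MiB */

static void check(const char *label, unsigned long bytes)
{
	printf("%-13s pageblock aligned: %-3s memory block aligned: %s\n",
	       label,
	       bytes % PAGEBLOCK_BYTES ? "no" : "yes",
	       bytes % MEMBLOCK_BYTES ? "no" : "yes");
}

int main(void)
{
	check("mem=3075M", 3075UL << 20);	/* yes / no: partial memory block */
	check("mem=2097148K", 2097148UL << 10);	/* no / no: partial pageblock too */
	return 0;
}

So mem=3075M leaves a partial memory block but intact pageblocks, while
something like mem=2097148K would indeed leave a partial pageblock as well.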

> 
> > 
> > When you say "address this much earlier", do you mean changing the way
> > that free_area_init_core()/memmap_init() initialize struct pages, i.e.
> > have them not use zone->spanned_pages as limit, but rather align that
> > up to the memory block (not pageblock) boundary?
> >   
> 
> This was my initial proposal: to fix memmap_init() and initialize struct
> pages beyond the "end", and before the "start", to cover the whole
> section. But I think Michal suggested (and he might correct me) to
> simply ignore memory that is not section-aligned much earlier: anything
> that does not align to the sparse section size is not added to the
> system at all.
> 
> I think Michal's proposal would simplify and strengthen memory mapping
> overall.

Of course it would be better to fix this in one place by providing
proper alignment, but to what unit: pageblock, section, or memory block?
I was just confused by the pageblock discussion, because in the current
scenario we do not have any pageblock issues, and pageblock alignment
would also not help here. Section alignment probably would; even though
a memory block can contain multiple sections, at least the memory hotplug
valid_zones/removable sysfs handlers seem to check for present sections
first, before accessing the struct pages.
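
To illustrate that last point, the per-memory-block walk in the sysfs
handlers conceptually looks like the sketch below (simplified, kernel
context assumed, not the actual drivers/base/memory.c code): absent
sections are skipped, but a present yet only partly initialized tail
section is not.

static int block_removable(unsigned long start_section_nr,
			   unsigned long sections_per_block)
{
	unsigned long nr, pfn;
	int ret = 1;

	for (nr = start_section_nr; nr < start_section_nr + sections_per_block; nr++) {
		if (!present_section_nr(nr))
			continue;	/* absent sections are skipped ... */
		pfn = section_nr_to_pfn(nr);
		/*
		 * ... but for a present, partly initialized tail section
		 * this still dereferences struct pages past the real end
		 * of memory.
		 */
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}
	return ret;
}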
Zaslonko Mikhail Oct. 24, 2018, 7:28 p.m. UTC | #19
Hello,

I hope it's still possible to revive this thread. Please find my comments below.

On 12.09.2018 16:40, Pasha Tatashin wrote:
> 
> 
> On 9/12/18 10:27 AM, Gerald Schaefer wrote:
>> On Wed, 12 Sep 2018 15:39:33 +0200
>> Michal Hocko <mhocko@kernel.org> wrote:
>>
>>> On Wed 12-09-18 15:03:56, Gerald Schaefer wrote:
>>> [...]
>>>> BTW, those sysfs attributes are world-readable, so anyone can trigger
>>>> the panic by simply reading them, or just run lsmem (also available for
>>>> x86 since util-linux 2.32). OK, you need a special not-memory-block-aligned
>>>> mem= parameter and DEBUG_VM for poison check, but w/o DEBUG_VM you would
>>>> still access uninitialized struct pages. This sounds very wrong, and I
>>>> think it really should be fixed.
>>>
>>> Ohh, absolutely. Nobody is questioning that. The thing is that the
>>> code has likely always been broken. We just haven't noticed because
>>> those uninitialized parts were zeroed previously. Now that the
>>> implicit zeroing is gone, it is just visible.
>>>
>>> All that I am arguing is that there are many places which assume
>>> pageblocks to be fully initialized, and plugging one place that blows
>>> up at a time is just whack-a-mole. We need to address this much
>>> earlier, e.g. by allowing only full pageblocks when adding a memory range.
>>
>> Just to make sure we are talking about the same thing: when you say
>> "pageblocks", do you mean the MAX_ORDER_NR_PAGES / pageblock_nr_pages
>> unit of pages, or do you mean the memory (hotplug) block unit?
> 
> From the early discussion, it was about pageblock_nr_pages, not about
> memory_block_size_bytes.
> 
>>
>> I do not see any issue here with MAX_ORDER_NR_PAGES / pageblock_nr_pages
>> pageblocks, and if there was such an issue, of course you are right that
>> this would affect many places. If there was such an issue, I would also
>> assume that we would see the new page poison warning in many other places.
>>
>> The bug that Mikhail's patch would fix only affects code that operates
>> on / iterates through memory (hotplug) blocks, and that does not happen
>> in many places, only in the two functions that his patch fixes.
> 
> Just to be clear: so memory is pageblock_nr_pages aligned, yet the
> memory_block is larger and the panic is still triggered?
> 
> I ask, because 3075M is not 128M aligned.
> 
>>
>> When you say "address this much earlier", do you mean changing the way
>> that free_area_init_core()/memmap_init() initialize struct pages, i.e.
>> have them not use zone->spanned_pages as limit, but rather align that
>> up to the memory block (not pageblock) boundary?
>>
> 
> This was my initial proposal: to fix memmap_init() and initialize struct
> pages beyond the "end", and before the "start", to cover the whole
> section. But I think Michal suggested (and he might correct me) to
> simply ignore memory that is not section-aligned much earlier: anything
> that does not align to the sparse section size is not added to the
> system at all.
> 

I tried both approaches but each of them has issues.

First I tried to ignore unaligned memory early by adjusting the memory_end
value. But the thing is that the mem= parameter parsing and the memory_end
calculation take place in the architecture code, and adjusting it afterwards
in common code might be too late in my view. Also, with this approach we
might lose up to an entire section of memory (256MB on s390) just because
of unfortunate alignment.
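
For example, truncating memory_end to a section boundary would amount to
something like this (userspace sketch with an assumed 256 MiB section size);
in the worst case almost a full section is thrown away:

#include <stdio.h>

#define SECTION_BYTES	(256UL << 20)	/* assumed s390 section size */

int main(void)
{
	unsigned long mem = 3075UL << 20;		/* mem=3075M */
	unsigned long end = mem & ~(SECTION_BYTES - 1);	/* round down */

	printf("memory_end %lu MiB -> %lu MiB, %lu MiB lost\n",
	       mem >> 20, end >> 20, (mem - end) >> 20);
	/* Only 3 MiB lost here, but e.g. mem=3327M would lose 255 MiB. */
	return 0;
}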

Another approach was "to fix memmap_init() and initialize struct pages beyond
the end". Since struct pages are allocated section-wise, we can try to round
the size parameter passed to memmap_init() up to the section boundary, thus
forcing the memmap initialization for the entire section. But then it leads
to another VM_BUG_ON panic, due to the zone_spans_pfn() sanity check being
triggered for the first page of each pageblock from the
set_pageblock_migratetype() function:
     page dumped because: VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn))
     Call Trace:
      ([<00000000003013f8>] set_pfnblock_flags_mask+0xe8/0x140)
       [<00000000003014aa>] set_pageblock_migratetype+0x5a/0x70
       [<0000000000bef706>] memmap_init_zone+0x25e/0x2e0
       [<00000000010fc3d8>] free_area_init_node+0x530/0x558
       [<00000000010fcf02>] free_area_init_nodes+0x81a/0x8f0
       [<00000000010e7fdc>] paging_init+0x124/0x130
       [<00000000010e4dfa>] setup_arch+0xbf2/0xcc8
       [<00000000010de9e6>] start_kernel+0x7e/0x588
       [<000000000010007c>] startup_continue+0x7c/0x300
     INFO: lockdep is turned off.
     Last Breaking-Event-Address:
      [<00000000003013f8>] set_pfnblock_flags_mask+0xe8/0x140
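
For reference, the rounding idea amounts to roughly the following at the
free_area_init_core() call site (a sketch only, kernel context assumed,
details of the real call site simplified):

	/* Round the memmap initialization up to a full section ... */
	size = ALIGN(zone_start_pfn + size, PAGES_PER_SECTION) - zone_start_pfn;
	memmap_init(size, nid, j, zone_start_pfn);
	/*
	 * ... but the extra pages lie outside the zone span, so
	 * set_pageblock_migratetype() -> set_pfnblock_flags_mask() hits
	 * VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn)) for the
	 * first page of each such pageblock, as shown above.
	 */
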
We might ignore this check for the struct pages beyond the "end", but I'm
not sure about the further implications.
Why don't we stay for now with my original proposal, fixing the specific
memory hotplug sysfs handler functions? Please tell me what you think.

Thanks,
Mikhail Zaslonko
diff mbox series

Patch

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9eea6e809a4e..8e20e8fcc3b0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1229,9 +1229,8 @@  static struct page *next_active_pageblock(struct page *page)
 	return page + pageblock_nr_pages;
 }
 
-static bool is_pageblock_removable_nolock(struct page *page)
+static bool is_pageblock_removable_nolock(struct page *page, struct zone **zone)
 {
-	struct zone *zone;
 	unsigned long pfn;
 
 	/*
@@ -1241,15 +1240,14 @@  static bool is_pageblock_removable_nolock(struct page *page)
 	 * We have to take care about the node as well. If the node is offline
 	 * its NODE_DATA will be NULL - see page_zone.
 	 */
-	if (!node_online(page_to_nid(page)))
-		return false;
-
-	zone = page_zone(page);
 	pfn = page_to_pfn(page);
-	if (!zone_spans_pfn(zone, pfn))
+	if (*zone && !zone_spans_pfn(*zone, pfn))
 		return false;
+	if (!node_online(page_to_nid(page)))
+		return false;
+	*zone = page_zone(page);
 
-	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
+	return !has_unmovable_pages(*zone, page, 0, MIGRATE_MOVABLE, true);
 }
 
 /* Checks if this range of memory is likely to be hot-removable. */
@@ -1257,10 +1255,11 @@  bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 {
 	struct page *page = pfn_to_page(start_pfn);
 	struct page *end_page = page + nr_pages;
+	struct zone *zone = NULL;
 
 	/* Check the starting page of each pageblock within the range */
 	for (; page < end_page; page = next_active_pageblock(page)) {
-		if (!is_pageblock_removable_nolock(page))
+		if (!is_pageblock_removable_nolock(page, &zone))
 			return false;
 		cond_resched();
 	}
@@ -1296,6 +1295,9 @@  int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 				i++;
 			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
 				continue;
+			/* Check if we got outside of the zone */
+			if (zone && !zone_spans_pfn(zone, pfn))
+				return 0;
 			page = pfn_to_page(pfn + i);
 			if (zone && page_zone(page) != zone)
 				return 0;