[2/2] mm,memory_hotplug: Fix shrink_{zone,node}_span
diff mbox series

Message ID 20190715081549.32577-3-osalvador@suse.de
State New
Headers show
Series
  • Fixes for sub-section hotplug
Related show

Commit Message

Oscar Salvador July 15, 2019, 8:15 a.m. UTC
Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION granularity.
The problem is that deactivation of the section occurs later on in
sparse_remove_section, so pfn_valid()->pfn_section_valid() will always return
true before we deactivate the {sub}section.

I spotted this during hotplug hotremove tests, there I always saw that
spanned_pages was, at least, left with PAGES_PER_SECTION, even if we
removed all memory linked to that zone.

Fix this by decoupling section_deactivate from sparse_remove_section, and
re-order the function calls.

Now, __remove_section will:

1) deactivate section
2) shrink {zone,node}'s pages
3) remove section

[1] https://patchwork.kernel.org/patch/11003467/

Fixes: mmotm ("mm/hotplug: prepare shrink_{zone, pgdat}_span for sub-section removal")
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
 include/linux/memory_hotplug.h |  7 ++--
 mm/memory_hotplug.c            |  6 +++-
 mm/sparse.c                    | 77 +++++++++++++++++++++++++++++-------------
 3 files changed, 62 insertions(+), 28 deletions(-)

Comments

Aneesh Kumar K.V July 15, 2019, 4:11 p.m. UTC | #1
Oscar Salvador <osalvador@suse.de> writes:

> Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION granularity.
> The problem is that deactivation of the section occurs later on in
> sparse_remove_section, so pfn_valid()->pfn_section_valid() will always return
> true before we deactivate the {sub}section.

Can you explain this more? The patch doesn't update section_mem_map
update sequence. So what changed? What is the problem in finding
pfn_valid() return true there?

>
> I spotted this during hotplug hotremove tests, there I always saw that
> spanned_pages was, at least, left with PAGES_PER_SECTION, even if we
> removed all memory linked to that zone.
>
> Fix this by decoupling section_deactivate from sparse_remove_section, and
> re-order the function calls.
>
> Now, __remove_section will:
>
> 1) deactivate section
> 2) shrink {zone,node}'s pages
> 3) remove section
>
> [1] https://patchwork.kernel.org/patch/11003467/
>
> Fixes: mmotm ("mm/hotplug: prepare shrink_{zone, pgdat}_span for sub-section removal")
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
>  include/linux/memory_hotplug.h |  7 ++--
>  mm/memory_hotplug.c            |  6 +++-
>  mm/sparse.c                    | 77 +++++++++++++++++++++++++++++-------------
>  3 files changed, 62 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index f46ea71b4ffd..d2eb917aad5f 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -348,9 +348,10 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
>  extern bool is_memblock_offlined(struct memory_block *mem);
>  extern int sparse_add_section(int nid, unsigned long pfn,
>  		unsigned long nr_pages, struct vmem_altmap *altmap);
> -extern void sparse_remove_section(struct mem_section *ms,
> -		unsigned long pfn, unsigned long nr_pages,
> -		unsigned long map_offset, struct vmem_altmap *altmap);
> +int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages);
> +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
> +                           unsigned long map_offset, struct vmem_altmap *altmap,
> +                           int section_empty);
>  extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
>  					  unsigned long pnum);
>  extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index b9ba5b85f9f7..03d535eee60d 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -517,12 +517,16 @@ static void __remove_section(struct zone *zone, unsigned long pfn,
>  		struct vmem_altmap *altmap)
>  {
>  	struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
> +	int ret;
>  
>  	if (WARN_ON_ONCE(!valid_section(ms)))
>  		return;
>  
> +	ret = sparse_deactivate_section(pfn, nr_pages);
>  	__remove_zone(zone, pfn, nr_pages);
> -	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
> +	if (ret >= 0)
> +		sparse_remove_section(pfn, nr_pages, map_offset, altmap,
> +				      ret);
>  }
>  
>  /**
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 1e224149aab6..d4953ee1d087 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -732,16 +732,47 @@ static void free_map_bootmem(struct page *memmap)
>  }
>  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
>  
> -static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
> -		struct vmem_altmap *altmap)
> +static void section_remove(unsigned long pfn, unsigned long nr_pages,
> +			   struct vmem_altmap *altmap, int section_empty)
> +{
> +	struct mem_section *ms = __pfn_to_section(pfn);
> +	bool section_early = early_section(ms);
> +	struct page *memmap = NULL;
> +
> +	if (section_empty) {
> +		unsigned long section_nr = pfn_to_section_nr(pfn);
> +
> +		if (!section_early) {
> +			kfree(ms->usage);
> +			ms->usage = NULL;
> +		}
> +		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
> +		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
> +	}
> +
> +        if (section_early && memmap)
> +		free_map_bootmem(memmap);
> +        else
> +		depopulate_section_memmap(pfn, nr_pages, altmap);
> +}
> +
> +/**
> + * section_deactivate: Deactivate a {sub}section.
> + *
> + * Return:
> + * * -1         - {sub}section has already been deactivated.
> + * * 0          - Section is not empty
> + * * 1          - Section is empty
> + */
> +
> +static int section_deactivate(unsigned long pfn, unsigned long nr_pages)
>  {
>  	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
>  	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
>  	struct mem_section *ms = __pfn_to_section(pfn);
> -	bool section_is_early = early_section(ms);
> -	struct page *memmap = NULL;
>  	unsigned long *subsection_map = ms->usage
>  		? &ms->usage->subsection_map[0] : NULL;
> +	int section_empty = 0;
>  
>  	subsection_mask_set(map, pfn, nr_pages);
>  	if (subsection_map)
> @@ -750,7 +781,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
>  	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
>  				"section already deactivated (%#lx + %ld)\n",
>  				pfn, nr_pages))
> -		return;
> +		return -1;
>  
>  	/*
>  	 * There are 3 cases to handle across two configurations
> @@ -770,21 +801,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
>  	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
>  	 */
>  	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
> -	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
> -		unsigned long section_nr = pfn_to_section_nr(pfn);
> -
> -		if (!section_is_early) {
> -			kfree(ms->usage);
> -			ms->usage = NULL;
> -		}
> -		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
> -		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
> -	}
> +	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION))
> +		section_empty = 1;
>  
> -	if (section_is_early && memmap)
> -		free_map_bootmem(memmap);
> -	else
> -		depopulate_section_memmap(pfn, nr_pages, altmap);
> +	return section_empty;
>  }
>  
>  static struct page * __meminit section_activate(int nid, unsigned long pfn,
> @@ -834,7 +854,11 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
>  
>  	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
>  	if (!memmap) {
> -		section_deactivate(pfn, nr_pages, altmap);
> +		int ret;
> +
> +		ret = section_deactivate(pfn, nr_pages);
> +		if (ret >= 0)
> +			section_remove(pfn, nr_pages, altmap, ret);
>  		return ERR_PTR(-ENOMEM);
>  	}
>  
> @@ -919,12 +943,17 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
>  }
>  #endif
>  
> -void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
> -		unsigned long nr_pages, unsigned long map_offset,
> -		struct vmem_altmap *altmap)
> +int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages)
> +{
> +	return section_deactivate(pfn, nr_pages);
> +}
> +
> +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
> +			   unsigned long map_offset, struct vmem_altmap *altmap,
> +			   int section_empty)
>  {
>  	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
>  			nr_pages - map_offset);
> -	section_deactivate(pfn, nr_pages, altmap);
> +	section_remove(pfn, nr_pages, altmap, section_empty);
>  }
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> -- 
> 2.12.3
Oscar Salvador July 15, 2019, 9:24 p.m. UTC | #2
On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote:
> Oscar Salvador <osalvador@suse.de> writes:
> 
> > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION
> > granularity.
> > The problem is that deactivation of the section occurs later on in
> > sparse_remove_section, so pfn_valid()->pfn_section_valid() will
> > always return
> > true before we deactivate the {sub}section.
> 
> Can you explain this more? The patch doesn't update section_mem_map
> update sequence. So what changed? What is the problem in finding
> pfn_valid() return true there?

I realized that the changelog was quite modest, so a better explanation
 will follow.

Let us analize what shrink_{zone,node}_span does.
We have to remember that shrink_zone_span gets called every time a
section is to be removed.

There can be three possibilites:

1) section to be removed is the first one of the zone
2) section to be removed is the last one of the zone
3) section to be removed falls in the middle
 
For 1) and 2) cases, we will try to find the next section from
bottom/top, and in the third case we will check whether the section
contains only holes.

Now, let us take the example where a ZONE contains only 1 section, and
we remove it.
The last loop of shrink_zone_span, will check for {start_pfn,end_pfn]
PAGES_PER_SECTION block the following:

- section is valid
- pfn relates to the current zone/nid
- section is not the section to be removed

Since we only got 1 section here, the check "start_pfn == pfn" will make us to continue the loop and then we are done.

Now, what happens after the patch?

We increment pfn on subsection basis, since "start_pfn == pfn", we jump
to the next sub-section (pfn+512), and call pfn_valid()-
>pfn_section_valid().
Since section has not been yet deactivded, pfn_section_valid() will
return true, and we will repeat this until the end of the loop.

What should happen instead is:

- we deactivate the {sub}-section before calling
shirnk_{zone,node}_span
- calls to pfn_valid() will now return false for the sections that have
been deactivated, and so we will get the pfn from the next activaded
sub-section, or nothing if the section is empty (section do not contain
active sub-sections).

The example relates to the last loop in shrink_zone_span, but the same
applies to find_{smalles,biggest}_section.

Please, note that we could probably do some hack like replacing:

start_pfn == pfn 

with

pfn < end_pfn

But the way to fix this is to 1) deactivate {sub}-section and 2) let
shrink_{node,zone}_span find the next active {sub-section}.

I hope this makes it more clear.
Dan Williams July 17, 2019, 2:28 a.m. UTC | #3
On Mon, Jul 15, 2019 at 2:24 PM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote:
> > Oscar Salvador <osalvador@suse.de> writes:
> >
> > > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION
> > > granularity.
> > > The problem is that deactivation of the section occurs later on in
> > > sparse_remove_section, so pfn_valid()->pfn_section_valid() will
> > > always return
> > > true before we deactivate the {sub}section.
> >
> > Can you explain this more? The patch doesn't update section_mem_map
> > update sequence. So what changed? What is the problem in finding
> > pfn_valid() return true there?
>
> I realized that the changelog was quite modest, so a better explanation
>  will follow.
>
> Let us analize what shrink_{zone,node}_span does.
> We have to remember that shrink_zone_span gets called every time a
> section is to be removed.
>
> There can be three possibilites:
>
> 1) section to be removed is the first one of the zone
> 2) section to be removed is the last one of the zone
> 3) section to be removed falls in the middle
>
> For 1) and 2) cases, we will try to find the next section from
> bottom/top, and in the third case we will check whether the section
> contains only holes.
>
> Now, let us take the example where a ZONE contains only 1 section, and
> we remove it.
> The last loop of shrink_zone_span, will check for {start_pfn,end_pfn]
> PAGES_PER_SECTION block the following:
>
> - section is valid
> - pfn relates to the current zone/nid
> - section is not the section to be removed
>
> Since we only got 1 section here, the check "start_pfn == pfn" will make us to continue the loop and then we are done.
>
> Now, what happens after the patch?
>
> We increment pfn on subsection basis, since "start_pfn == pfn", we jump
> to the next sub-section (pfn+512), and call pfn_valid()-
> >pfn_section_valid().
> Since section has not been yet deactivded, pfn_section_valid() will
> return true, and we will repeat this until the end of the loop.
>
> What should happen instead is:
>
> - we deactivate the {sub}-section before calling
> shirnk_{zone,node}_span
> - calls to pfn_valid() will now return false for the sections that have
> been deactivated, and so we will get the pfn from the next activaded
> sub-section, or nothing if the section is empty (section do not contain
> active sub-sections).
>
> The example relates to the last loop in shrink_zone_span, but the same
> applies to find_{smalles,biggest}_section.
>
> Please, note that we could probably do some hack like replacing:
>
> start_pfn == pfn
>
> with
>
> pfn < end_pfn
>
> But the way to fix this is to 1) deactivate {sub}-section and 2) let
> shrink_{node,zone}_span find the next active {sub-section}.
>
> I hope this makes it more clear.

This makes it more clear that the problem is with the "start_pfn ==
pfn" check relative to subsections, but it does not clarify why it
needs to clear pfn_valid() before calling shrink_zone_span().
Sections were not invalidated prior to shrink_zone_span() in the
pre-subsection implementation and it seems all we need is to keep the
same semantic. I.e. skip the range that is currently being removed:

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 37d49579ac15..b69832db442b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -422,8 +422,8 @@ static void shrink_zone_span(struct zone *zone,
unsigned long start_pfn,
                if (page_zone(pfn_to_page(pfn)) != zone)
                        continue;

-                /* If the section is current section, it continues the loop */
-               if (start_pfn == pfn)
+                /* If the sub-section is current span being removed, skip */
+               if (pfn >= start_pfn && pfn < end_pfn)
                        continue;

                /* If we find valid section, we have nothing to do */


I otherwise don't follow why we would need to deactivate prior to
__remove_zone().
Aneesh Kumar K.V July 17, 2019, 5:38 a.m. UTC | #4
Oscar Salvador <osalvador@suse.de> writes:

> On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote:
>> Oscar Salvador <osalvador@suse.de> writes:
>> 
>> > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION
>> > granularity.
>> > The problem is that deactivation of the section occurs later on in
>> > sparse_remove_section, so pfn_valid()->pfn_section_valid() will
>> > always return
>> > true before we deactivate the {sub}section.
>> 
>> Can you explain this more? The patch doesn't update section_mem_map
>> update sequence. So what changed? What is the problem in finding
>> pfn_valid() return true there?
>
> I realized that the changelog was quite modest, so a better explanation
>  will follow.
>
> Let us analize what shrink_{zone,node}_span does.
> We have to remember that shrink_zone_span gets called every time a
> section is to be removed.
>
> There can be three possibilites:
>
> 1) section to be removed is the first one of the zone
> 2) section to be removed is the last one of the zone
> 3) section to be removed falls in the middle
>  
> For 1) and 2) cases, we will try to find the next section from
> bottom/top, and in the third case we will check whether the section
> contains only holes.
>
> Now, let us take the example where a ZONE contains only 1 section, and
> we remove it.
> The last loop of shrink_zone_span, will check for {start_pfn,end_pfn]
> PAGES_PER_SECTION block the following:
>
> - section is valid
> - pfn relates to the current zone/nid
> - section is not the section to be removed
>
> Since we only got 1 section here, the check "start_pfn == pfn" will make us to continue the loop and then we are done.
>
> Now, what happens after the patch?
>
> We increment pfn on subsection basis, since "start_pfn == pfn", we jump
> to the next sub-section (pfn+512), and call pfn_valid()-
>>pfn_section_valid().
> Since section has not been yet deactivded, pfn_section_valid() will
> return true, and we will repeat this until the end of the loop.
>
> What should happen instead is:
>
> - we deactivate the {sub}-section before calling
> shirnk_{zone,node}_span
> - calls to pfn_valid() will now return false for the sections that have
> been deactivated, and so we will get the pfn from the next activaded
> sub-section, or nothing if the section is empty (section do not contain
> active sub-sections).
>
> The example relates to the last loop in shrink_zone_span, but the same
> applies to find_{smalles,biggest}_section.
>
> Please, note that we could probably do some hack like replacing:
>
> start_pfn == pfn 
>
> with
>
> pfn < end_pfn

Why do you consider this a hack? 

 /* If the section is current section, it continues the loop */
	if (start_pfn == pfn)
		continue;

The comment explains that check is there to handle the exact scenario
that you are fixing in this patch. With subsection patch that check is
not sufficient. Shouldn't we just fix the check to handle that?

Not sure about your comment w.r.t find_{smalles,biggest}_section. We
search with pfn range outside the subsection we are trying to remove.
So this should not have an impact there?


>
> But the way to fix this is to 1) deactivate {sub}-section and 2) let
> shrink_{node,zone}_span find the next active {sub-section}.
>
> I hope this makes it more clear.

-aneesh
Oscar Salvador July 17, 2019, 7:38 a.m. UTC | #5
On Tue, Jul 16, 2019 at 07:28:54PM -0700, Dan Williams wrote:
> This makes it more clear that the problem is with the "start_pfn ==
> pfn" check relative to subsections, but it does not clarify why it
> needs to clear pfn_valid() before calling shrink_zone_span().
> Sections were not invalidated prior to shrink_zone_span() in the
> pre-subsection implementation and it seems all we need is to keep the
> same semantic. I.e. skip the range that is currently being removed:

Yes, as I said in my reply to Aneesh, that is the other way I thought
when fixing it.
The reason I went this way is because it seemed more reasonable and
natural to me that pfn_valid() would just return the next active
sub-section.

I just though that we could leverage the fact that we can deactivate
a sub-section before scanning for the next one.

On a second thought, the changes do not outweight the case, being the first
fix enough and less intrusive, so I will send a v2 with that instead.
Oscar Salvador July 17, 2019, 7:42 a.m. UTC | #6
On Wed, Jul 17, 2019 at 11:08:54AM +0530, Aneesh Kumar K.V wrote:
> Oscar Salvador <osalvador@suse.de> writes:
> 
> > On Mon, 2019-07-15 at 21:41 +0530, Aneesh Kumar K.V wrote:
> >> Oscar Salvador <osalvador@suse.de> writes:
> >> 
> >> > Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION
> >> > granularity.
> >> > The problem is that deactivation of the section occurs later on in
> >> > sparse_remove_section, so pfn_valid()->pfn_section_valid() will
> >> > always return
> >> > true before we deactivate the {sub}section.
> >> 
> >> Can you explain this more? The patch doesn't update section_mem_map
> >> update sequence. So what changed? What is the problem in finding
> >> pfn_valid() return true there?
> >
> > I realized that the changelog was quite modest, so a better explanation
> >  will follow.
> >
> > Let us analize what shrink_{zone,node}_span does.
> > We have to remember that shrink_zone_span gets called every time a
> > section is to be removed.
> >
> > There can be three possibilites:
> >
> > 1) section to be removed is the first one of the zone
> > 2) section to be removed is the last one of the zone
> > 3) section to be removed falls in the middle
> >  
> > For 1) and 2) cases, we will try to find the next section from
> > bottom/top, and in the third case we will check whether the section
> > contains only holes.
> >
> > Now, let us take the example where a ZONE contains only 1 section, and
> > we remove it.
> > The last loop of shrink_zone_span, will check for {start_pfn,end_pfn]
> > PAGES_PER_SECTION block the following:
> >
> > - section is valid
> > - pfn relates to the current zone/nid
> > - section is not the section to be removed
> >
> > Since we only got 1 section here, the check "start_pfn == pfn" will make us to continue the loop and then we are done.
> >
> > Now, what happens after the patch?
> >
> > We increment pfn on subsection basis, since "start_pfn == pfn", we jump
> > to the next sub-section (pfn+512), and call pfn_valid()-
> >>pfn_section_valid().
> > Since section has not been yet deactivded, pfn_section_valid() will
> > return true, and we will repeat this until the end of the loop.
> >
> > What should happen instead is:
> >
> > - we deactivate the {sub}-section before calling
> > shirnk_{zone,node}_span
> > - calls to pfn_valid() will now return false for the sections that have
> > been deactivated, and so we will get the pfn from the next activaded
> > sub-section, or nothing if the section is empty (section do not contain
> > active sub-sections).
> >
> > The example relates to the last loop in shrink_zone_span, but the same
> > applies to find_{smalles,biggest}_section.
> >
> > Please, note that we could probably do some hack like replacing:
> >
> > start_pfn == pfn 
> >
> > with
> >
> > pfn < end_pfn
> 
> Why do you consider this a hack? 
> 
>  /* If the section is current section, it continues the loop */
> 	if (start_pfn == pfn)
> 		continue;

I did not consider this a hack, but I really did not like to adapt that
to the sub-section case as it seemed more natural to 1) deactivate
sub-section and 2) look for the next one.
So we would not need these checks.

I might have bored at that time and I went for the most complex way to fix
it.

I will send v2 with the less intrusive check.

> 
> The comment explains that check is there to handle the exact scenario
> that you are fixing in this patch. With subsection patch that check is
> not sufficient. Shouldn't we just fix the check to handle that?
> 
> Not sure about your comment w.r.t find_{smalles,biggest}_section. We
> search with pfn range outside the subsection we are trying to remove.
> So this should not have an impact there?

Yeah, I overlooked the code.
David Hildenbrand July 17, 2019, 8:01 a.m. UTC | #7
On 17.07.19 09:38, Oscar Salvador wrote:
> On Tue, Jul 16, 2019 at 07:28:54PM -0700, Dan Williams wrote:
>> This makes it more clear that the problem is with the "start_pfn ==
>> pfn" check relative to subsections, but it does not clarify why it
>> needs to clear pfn_valid() before calling shrink_zone_span().
>> Sections were not invalidated prior to shrink_zone_span() in the
>> pre-subsection implementation and it seems all we need is to keep the
>> same semantic. I.e. skip the range that is currently being removed:
> 
> Yes, as I said in my reply to Aneesh, that is the other way I thought
> when fixing it.
> The reason I went this way is because it seemed more reasonable and
> natural to me that pfn_valid() would just return the next active
> sub-section.
> 
> I just though that we could leverage the fact that we can deactivate
> a sub-section before scanning for the next one.
> 
> On a second thought, the changes do not outweight the case, being the first
> fix enough and less intrusive, so I will send a v2 with that instead.
> 
> 

I'd also like to note that we should strive for making all zone-related
changes when offlining in the future, not when removing memory. So
ideally, any core changes we perform from now, should make that step
(IOW implementing that) easier, not harder. Of course, BUGs have to be
fixed.

The rough idea would be to also mark ZONE_DEVICE sections as ONLINE (or
rather rename it to "ACTIVE" to generalize).

For each section we would then have

- pfn_valid(): We have a valid "struct page" / memmap
- pfn_present(): We have actually added that memory via an oficial
  interface to mm (e.g., arch_add_memory() )
- pfn_online() / (or pfn_active()): Memory is active (online in "buddy"-
  speak, or memory that was moved to the ZONE_DEVICE zone)

When resizing the zones (e.g., when offlining memory), we would then
search for pfn_online(), not pfn_present().

In addition to move_pfn_range_to_zone(), we would have
remove_pfn_range_from_zone(), called during offline_pages() / by
devmem/hmm/pmem code before removing.

(I started to look into this, but I don't have any patches yet)
Oscar Salvador July 17, 2019, 8:08 a.m. UTC | #8
On Wed, Jul 17, 2019 at 10:01:01AM +0200, David Hildenbrand wrote:
> I'd also like to note that we should strive for making all zone-related
> changes when offlining in the future, not when removing memory. So
> ideally, any core changes we perform from now, should make that step
> (IOW implementing that) easier, not harder. Of course, BUGs have to be
> fixed.
> 
> The rough idea would be to also mark ZONE_DEVICE sections as ONLINE (or
> rather rename it to "ACTIVE" to generalize).
> 
> For each section we would then have
> 
> - pfn_valid(): We have a valid "struct page" / memmap
> - pfn_present(): We have actually added that memory via an oficial
>   interface to mm (e.g., arch_add_memory() )
> - pfn_online() / (or pfn_active()): Memory is active (online in "buddy"-
>   speak, or memory that was moved to the ZONE_DEVICE zone)
> 
> When resizing the zones (e.g., when offlining memory), we would then
> search for pfn_online(), not pfn_present().
> 
> In addition to move_pfn_range_to_zone(), we would have
> remove_pfn_range_from_zone(), called during offline_pages() / by
> devmem/hmm/pmem code before removing.
> 
> (I started to look into this, but I don't have any patches yet)

Yes, it seems like a good approach, and something that makes sense
to pursue.
FWIF, I sent a patchset [1] for that a long time ago, but I could not
follow-up due to time constraints.

[1] https://patchwork.kernel.org/cover/10700783/
Oscar Salvador July 18, 2019, 12:05 p.m. UTC | #9
On Mon, Jul 15, 2019 at 10:15:49AM +0200, Oscar Salvador wrote:
> Since [1], shrink_{zone,node}_span work on PAGES_PER_SUBSECTION granularity.
> The problem is that deactivation of the section occurs later on in
> sparse_remove_section, so pfn_valid()->pfn_section_valid() will always return
> true before we deactivate the {sub}section.
> 
> I spotted this during hotplug hotremove tests, there I always saw that
> spanned_pages was, at least, left with PAGES_PER_SECTION, even if we
> removed all memory linked to that zone.
> 
> Fix this by decoupling section_deactivate from sparse_remove_section, and
> re-order the function calls.
> 
> Now, __remove_section will:
> 
> 1) deactivate section
> 2) shrink {zone,node}'s pages
> 3) remove section
> 
> [1] https://patchwork.kernel.org/patch/11003467/

Hi Andrew,

Please, drop this patch as patch [1] is the easiest way to fix this.

thanks a lot

[1] https://patchwork.kernel.org/patch/11047499/

> 
> Fixes: mmotm ("mm/hotplug: prepare shrink_{zone, pgdat}_span for sub-section removal")
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
>  include/linux/memory_hotplug.h |  7 ++--
>  mm/memory_hotplug.c            |  6 +++-
>  mm/sparse.c                    | 77 +++++++++++++++++++++++++++++-------------
>  3 files changed, 62 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index f46ea71b4ffd..d2eb917aad5f 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -348,9 +348,10 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
>  extern bool is_memblock_offlined(struct memory_block *mem);
>  extern int sparse_add_section(int nid, unsigned long pfn,
>  		unsigned long nr_pages, struct vmem_altmap *altmap);
> -extern void sparse_remove_section(struct mem_section *ms,
> -		unsigned long pfn, unsigned long nr_pages,
> -		unsigned long map_offset, struct vmem_altmap *altmap);
> +int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages);
> +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
> +                           unsigned long map_offset, struct vmem_altmap *altmap,
> +                           int section_empty);
>  extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
>  					  unsigned long pnum);
>  extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index b9ba5b85f9f7..03d535eee60d 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -517,12 +517,16 @@ static void __remove_section(struct zone *zone, unsigned long pfn,
>  		struct vmem_altmap *altmap)
>  {
>  	struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
> +	int ret;
>  
>  	if (WARN_ON_ONCE(!valid_section(ms)))
>  		return;
>  
> +	ret = sparse_deactivate_section(pfn, nr_pages);
>  	__remove_zone(zone, pfn, nr_pages);
> -	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
> +	if (ret >= 0)
> +		sparse_remove_section(pfn, nr_pages, map_offset, altmap,
> +				      ret);
>  }
>  
>  /**
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 1e224149aab6..d4953ee1d087 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -732,16 +732,47 @@ static void free_map_bootmem(struct page *memmap)
>  }
>  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
>  
> -static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
> -		struct vmem_altmap *altmap)
> +static void section_remove(unsigned long pfn, unsigned long nr_pages,
> +			   struct vmem_altmap *altmap, int section_empty)
> +{
> +	struct mem_section *ms = __pfn_to_section(pfn);
> +	bool section_early = early_section(ms);
> +	struct page *memmap = NULL;
> +
> +	if (section_empty) {
> +		unsigned long section_nr = pfn_to_section_nr(pfn);
> +
> +		if (!section_early) {
> +			kfree(ms->usage);
> +			ms->usage = NULL;
> +		}
> +		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
> +		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
> +	}
> +
> +        if (section_early && memmap)
> +		free_map_bootmem(memmap);
> +        else
> +		depopulate_section_memmap(pfn, nr_pages, altmap);
> +}
> +
> +/**
> + * section_deactivate: Deactivate a {sub}section.
> + *
> + * Return:
> + * * -1         - {sub}section has already been deactivated.
> + * * 0          - Section is not empty
> + * * 1          - Section is empty
> + */
> +
> +static int section_deactivate(unsigned long pfn, unsigned long nr_pages)
>  {
>  	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
>  	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
>  	struct mem_section *ms = __pfn_to_section(pfn);
> -	bool section_is_early = early_section(ms);
> -	struct page *memmap = NULL;
>  	unsigned long *subsection_map = ms->usage
>  		? &ms->usage->subsection_map[0] : NULL;
> +	int section_empty = 0;
>  
>  	subsection_mask_set(map, pfn, nr_pages);
>  	if (subsection_map)
> @@ -750,7 +781,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
>  	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
>  				"section already deactivated (%#lx + %ld)\n",
>  				pfn, nr_pages))
> -		return;
> +		return -1;
>  
>  	/*
>  	 * There are 3 cases to handle across two configurations
> @@ -770,21 +801,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
>  	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
>  	 */
>  	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
> -	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
> -		unsigned long section_nr = pfn_to_section_nr(pfn);
> -
> -		if (!section_is_early) {
> -			kfree(ms->usage);
> -			ms->usage = NULL;
> -		}
> -		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
> -		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
> -	}
> +	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION))
> +		section_empty = 1;
>  
> -	if (section_is_early && memmap)
> -		free_map_bootmem(memmap);
> -	else
> -		depopulate_section_memmap(pfn, nr_pages, altmap);
> +	return section_empty;
>  }
>  
>  static struct page * __meminit section_activate(int nid, unsigned long pfn,
> @@ -834,7 +854,11 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
>  
>  	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
>  	if (!memmap) {
> -		section_deactivate(pfn, nr_pages, altmap);
> +		int ret;
> +
> +		ret = section_deactivate(pfn, nr_pages);
> +		if (ret >= 0)
> +			section_remove(pfn, nr_pages, altmap, ret);
>  		return ERR_PTR(-ENOMEM);
>  	}
>  
> @@ -919,12 +943,17 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
>  }
>  #endif
>  
> -void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
> -		unsigned long nr_pages, unsigned long map_offset,
> -		struct vmem_altmap *altmap)
> +int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages)
> +{
> +	return section_deactivate(pfn, nr_pages);
> +}
> +
> +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
> +			   unsigned long map_offset, struct vmem_altmap *altmap,
> +			   int section_empty)
>  {
>  	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
>  			nr_pages - map_offset);
> -	section_deactivate(pfn, nr_pages, altmap);
> +	section_remove(pfn, nr_pages, altmap, section_empty);
>  }
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> -- 
> 2.12.3
>

Patch
diff mbox series

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f46ea71b4ffd..d2eb917aad5f 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -348,9 +348,10 @@  extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern int sparse_add_section(int nid, unsigned long pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap);
-extern void sparse_remove_section(struct mem_section *ms,
-		unsigned long pfn, unsigned long nr_pages,
-		unsigned long map_offset, struct vmem_altmap *altmap);
+int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages);
+void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
+                           unsigned long map_offset, struct vmem_altmap *altmap,
+                           int section_empty);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
 extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9ba5b85f9f7..03d535eee60d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -517,12 +517,16 @@  static void __remove_section(struct zone *zone, unsigned long pfn,
 		struct vmem_altmap *altmap)
 {
 	struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
+	int ret;
 
 	if (WARN_ON_ONCE(!valid_section(ms)))
 		return;
 
+	ret = sparse_deactivate_section(pfn, nr_pages);
 	__remove_zone(zone, pfn, nr_pages);
-	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
+	if (ret >= 0)
+		sparse_remove_section(pfn, nr_pages, map_offset, altmap,
+				      ret);
 }
 
 /**
diff --git a/mm/sparse.c b/mm/sparse.c
index 1e224149aab6..d4953ee1d087 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -732,16 +732,47 @@  static void free_map_bootmem(struct page *memmap)
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
-static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap)
+static void section_remove(unsigned long pfn, unsigned long nr_pages,
+			   struct vmem_altmap *altmap, int section_empty)
+{
+	struct mem_section *ms = __pfn_to_section(pfn);
+	bool section_early = early_section(ms);
+	struct page *memmap = NULL;
+
+	if (section_empty) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+
+		if (!section_early) {
+			kfree(ms->usage);
+			ms->usage = NULL;
+		}
+		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+	}
+
+        if (section_early && memmap)
+		free_map_bootmem(memmap);
+        else
+		depopulate_section_memmap(pfn, nr_pages, altmap);
+}
+
+/**
+ * section_deactivate: Deactivate a {sub}section.
+ *
+ * Return:
+ * * -1         - {sub}section has already been deactivated.
+ * * 0          - Section is not empty
+ * * 1          - Section is empty
+ */
+
+static int section_deactivate(unsigned long pfn, unsigned long nr_pages)
 {
 	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
 	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
 	struct mem_section *ms = __pfn_to_section(pfn);
-	bool section_is_early = early_section(ms);
-	struct page *memmap = NULL;
 	unsigned long *subsection_map = ms->usage
 		? &ms->usage->subsection_map[0] : NULL;
+	int section_empty = 0;
 
 	subsection_mask_set(map, pfn, nr_pages);
 	if (subsection_map)
@@ -750,7 +781,7 @@  static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
 				"section already deactivated (%#lx + %ld)\n",
 				pfn, nr_pages))
-		return;
+		return -1;
 
 	/*
 	 * There are 3 cases to handle across two configurations
@@ -770,21 +801,10 @@  static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
 	 */
 	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
-	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
-		unsigned long section_nr = pfn_to_section_nr(pfn);
-
-		if (!section_is_early) {
-			kfree(ms->usage);
-			ms->usage = NULL;
-		}
-		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
-	}
+	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION))
+		section_empty = 1;
 
-	if (section_is_early && memmap)
-		free_map_bootmem(memmap);
-	else
-		depopulate_section_memmap(pfn, nr_pages, altmap);
+	return section_empty;
 }
 
 static struct page * __meminit section_activate(int nid, unsigned long pfn,
@@ -834,7 +854,11 @@  static struct page * __meminit section_activate(int nid, unsigned long pfn,
 
 	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
 	if (!memmap) {
-		section_deactivate(pfn, nr_pages, altmap);
+		int ret;
+
+		ret = section_deactivate(pfn, nr_pages);
+		if (ret >= 0)
+			section_remove(pfn, nr_pages, altmap, ret);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -919,12 +943,17 @@  static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
-		unsigned long nr_pages, unsigned long map_offset,
-		struct vmem_altmap *altmap)
+int sparse_deactivate_section(unsigned long pfn, unsigned long nr_pages)
+{
+	return section_deactivate(pfn, nr_pages);
+}
+
+void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
+			   unsigned long map_offset, struct vmem_altmap *altmap,
+			   int section_empty)
 {
 	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
 			nr_pages - map_offset);
-	section_deactivate(pfn, nr_pages, altmap);
+	section_remove(pfn, nr_pages, altmap, section_empty);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */