diff mbox series

[v8,09/12] mm/sparsemem: Support sub-section hotplug

Message ID 155718601407.130019.14248061058774128227.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show
Series mm: Sub-section memory hotplug support | expand

Commit Message

Dan Williams May 6, 2019, 11:40 p.m. UTC
The libnvdimm sub-system has suffered a series of hacks and broken
workarounds for the memory-hotplug implementation's awkward
section-aligned (128MB) granularity. For example the following backtrace
is emitted when attempting arch_add_memory() with physical address
ranges that intersect 'System RAM' (RAM) with 'Persistent Memory' (PMEM)
within a given section:

 WARNING: CPU: 0 PID: 558 at kernel/memremap.c:300 devm_memremap_pages+0x3b5/0x4c0
 devm_memremap_pages attempted on mixed region [mem 0x200000000-0x2fbffffff flags 0x200]
 [..]
 Call Trace:
   dump_stack+0x86/0xc3
   __warn+0xcb/0xf0
   warn_slowpath_fmt+0x5f/0x80
   devm_memremap_pages+0x3b5/0x4c0
   __wrap_devm_memremap_pages+0x58/0x70 [nfit_test_iomap]
   pmem_attach_disk+0x19a/0x440 [nd_pmem]

Recently it was discovered that the problem goes beyond RAM vs PMEM
collisions as some platform produce PMEM vs PMEM collisions within a
given section. The libnvdimm workaround for that case revealed that the
libnvdimm section-alignment-padding implementation has been broken for a
long while. A fix for that long-standing breakage introduces as many
problems as it solves as it would require a backward-incompatible change
to the namespace metadata interpretation. Instead of that dubious route
[1], address the root problem in the memory-hotplug implementation.

[1]: https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 mm/sparse.c |  255 ++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 175 insertions(+), 80 deletions(-)

Comments

Oscar Salvador May 8, 2019, 11:15 p.m. UTC | #1
On Mon, 2019-05-06 at 16:40 -0700, Dan Williams wrote:
> @@ -741,49 +895,31 @@ int __meminit sparse_add_section(int nid,
> unsigned long start_pfn,
>  		unsigned long nr_pages, struct vmem_altmap *altmap)
>  {
>  	unsigned long section_nr = pfn_to_section_nr(start_pfn);
> -	struct mem_section_usage *usage;
>  	struct mem_section *ms;
>  	struct page *memmap;
>  	int ret;

I already pointed this out in v7, but:

>  
> -	/*
> -	 * no locking for this, because it does its own
> -	 * plus, it does a kmalloc
> -	 */
>  	ret = sparse_index_init(section_nr, nid);
>  	if (ret < 0 && ret != -EEXIST)
>  		return ret;

sparse_index_init() only returns either -ENOMEM or 0, so the above can
be:

	if (ret < 0) or if (ret)

> -	ret = 0;
> -	memmap = populate_section_memmap(start_pfn,
> PAGES_PER_SECTION, nid,
> -			altmap);
> -	if (!memmap)
> -		return -ENOMEM;
> -	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
> -	if (!usage) {
> -		depopulate_section_memmap(start_pfn,
> PAGES_PER_SECTION, altmap);
> -		return -ENOMEM;
> -	}
>  
> -	ms = __pfn_to_section(start_pfn);
> -	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
> -		ret = -EEXIST;
> -		goto out;
> -	}
> +	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
> +	if (IS_ERR(memmap))
> +		return PTR_ERR(memmap);
> +	ret = 0;

If we got here, sparse_index_init must have returned 0, so ret already
contains 0.
We can remove the assignment.

>  
>  	/*
>  	 * Poison uninitialized struct pages in order to catch
> invalid flags
>  	 * combinations.
>  	 */
> -	page_init_poison(memmap, sizeof(struct page) *
> PAGES_PER_SECTION);
> +	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page)
> * nr_pages);
>  
> +	ms = __pfn_to_section(start_pfn);
>  	section_mark_present(ms);
> -	sparse_init_one_section(ms, section_nr, memmap, usage);
> +	sparse_init_one_section(ms, section_nr, memmap, ms->usage);
>  
> -out:
> -	if (ret < 0) {
> -		kfree(usage);
> -		depopulate_section_memmap(start_pfn,
> PAGES_PER_SECTION, altmap);
> -	}
> +	if (ret < 0)
> +		section_deactivate(start_pfn, nr_pages, nid,
> altmap);

AFAICS, ret is only set by the return code from sparse_index_init, so
we cannot really get to this code being ret different than 0.
So we can remove the above two lines.

I will start reviewing the patches that lack review from this version
soon.
Oscar Salvador May 13, 2019, 1:54 p.m. UTC | #2
On Mon, May 06, 2019 at 04:40:14PM -0700, Dan Williams wrote:
>  
> +void subsection_mask_set(unsigned long *map, unsigned long pfn,
> +		unsigned long nr_pages)
> +{
> +	int idx = subsection_map_index(pfn);
> +	int end = subsection_map_index(pfn + nr_pages - 1);
> +
> +	bitmap_set(map, idx, end - idx + 1);
> +}
> +
>  void subsection_map_init(unsigned long pfn, unsigned long nr_pages)
>  {
>  	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
> @@ -219,20 +235,17 @@ void subsection_map_init(unsigned long pfn, unsigned long nr_pages)
>  		return;
>  
>  	for (i = start_sec; i <= end_sec; i++) {
> -		int idx, end;
> -		unsigned long pfns;
>  		struct mem_section *ms;
> +		unsigned long pfns;
>  
> -		idx = subsection_map_index(pfn);
>  		pfns = min(nr_pages, PAGES_PER_SECTION
>  				- (pfn & ~PAGE_SECTION_MASK));
> -		end = subsection_map_index(pfn + pfns - 1);
> -
>  		ms = __nr_to_section(i);
> -		bitmap_set(ms->usage->subsection_map, idx, end - idx + 1);
> +		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
>  
>  		pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i,
> -				pfns, idx, end - idx + 1);
> +				pfns, subsection_map_index(pfn),
> +				subsection_map_index(pfn + pfns - 1));

I would definetely add subsection_mask_set() and above change to Patch#3.
I think it suits there better than here.

>  
>  		pfn += pfns;
>  		nr_pages -= pfns;
> @@ -319,6 +332,15 @@ static void __meminit sparse_init_one_section(struct mem_section *ms,
>  		unsigned long pnum, struct page *mem_map,
>  		struct mem_section_usage *usage)
>  {
> +	/*
> +	 * Given that SPARSEMEM_VMEMMAP=y supports sub-section hotplug,
> +	 * ->section_mem_map can not be guaranteed to point to a full
> +	 *  section's worth of memory.  The field is only valid / used
> +	 *  in the SPARSEMEM_VMEMMAP=n case.
> +	 */
> +	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
> +		mem_map = NULL;
> +
>  	ms->section_mem_map &= ~SECTION_MAP_MASK;
>  	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
>  							SECTION_HAS_MEM_MAP;
> @@ -724,10 +746,142 @@ static void free_map_bootmem(struct page *memmap)
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
>  
> +#ifndef CONFIG_MEMORY_HOTREMOVE
> +static void free_map_bootmem(struct page *memmap)
> +{
> +}
> +#endif
> +
> +static bool is_early_section(struct mem_section *ms)
> +{
> +	struct page *usage_page;
> +
> +	usage_page = virt_to_page(ms->usage);
> +	if (PageSlab(usage_page) || PageCompound(usage_page))
> +		return false;
> +	else
> +		return true;
> +}
> +
> +static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
> +		int nid, struct vmem_altmap *altmap)
> +{
> +	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
> +	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
> +	struct mem_section *ms = __pfn_to_section(pfn);
> +	bool early_section = is_early_section(ms);
> +	struct page *memmap = NULL;
> +	unsigned long *subsection_map = ms->usage
> +		? &ms->usage->subsection_map[0] : NULL;
> +
> +	subsection_mask_set(map, pfn, nr_pages);
> +	if (subsection_map)
> +		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
> +
> +	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
> +				"section already deactivated (%#lx + %ld)\n",
> +				pfn, nr_pages))
> +		return;
> +
> +	if (WARN(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)
> +				&& nr_pages < PAGES_PER_SECTION,
> +				"partial memory section removal not supported\n"))
> +		return;
> +
> +	/*
> +	 * There are 3 cases to handle across two configurations
> +	 * (SPARSEMEM_VMEMMAP={y,n}):
> +	 *
> +	 * 1/ deactivation of a partial hot-added section (only possible
> +	 * in the SPARSEMEM_VMEMMAP=y case).
> +	 *    a/ section was present at memory init
> +	 *    b/ section was hot-added post memory init
> +	 * 2/ deactivation of a complete hot-added section
> +	 * 3/ deactivation of a complete section from memory init
> +	 *
> +	 * For 1/, when subsection_map does not empty we will not be
> +	 * freeing the usage map, but still need to free the vmemmap
> +	 * range.
> +	 *
> +	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
> +	 */
> +	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
> +	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
> +		unsigned long section_nr = pfn_to_section_nr(pfn);
> +
> +		if (!early_section) {
> +			kfree(ms->usage);
> +			ms->usage = NULL;
> +		}
> +		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
> +		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
> +	}
> +
> +	if (early_section && memmap)
> +		free_map_bootmem(memmap);
> +	else
> +		depopulate_section_memmap(pfn, nr_pages, altmap);
> +}
> +
> +static struct page * __meminit section_activate(int nid, unsigned long pfn,
> +		unsigned long nr_pages, struct vmem_altmap *altmap)
> +{
> +	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
> +	struct mem_section *ms = __pfn_to_section(pfn);
> +	struct mem_section_usage *usage = NULL;
> +	unsigned long *subsection_map;
> +	struct page *memmap;
> +	int rc = 0;
> +
> +	subsection_mask_set(map, pfn, nr_pages);
> +
> +	if (!ms->usage) {
> +		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
> +		if (!usage)
> +			return ERR_PTR(-ENOMEM);
> +		ms->usage = usage;
> +	}
> +	subsection_map = &ms->usage->subsection_map[0];
> +
> +	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
> +		rc = -EINVAL;
> +	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
> +		rc = -EEXIST;
> +	else
> +		bitmap_or(subsection_map, map, subsection_map,
> +				SUBSECTIONS_PER_SECTION);
> +
> +	if (rc) {
> +		if (usage)
> +			ms->usage = NULL;
> +		kfree(usage);
> +		return ERR_PTR(rc);
> +	}
> +
> +	/*
> +	 * The early init code does not consider partially populated
> +	 * initial sections, it simply assumes that memory will never be
> +	 * referenced.  If we hot-add memory into such a section then we
> +	 * do not need to populate the memmap and can simply reuse what
> +	 * is already there.
> +	 */
> +	if (nr_pages < PAGES_PER_SECTION && is_early_section(ms))
> +		return pfn_to_page(pfn);
> +
> +	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
> +	if (!memmap) {
> +		section_deactivate(pfn, nr_pages, nid, altmap);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	return memmap;
> +}

I do not really like this.
Sub-section scheme is only available on CONFIG_SPARSE_VMEMMAP, so I would rather
have two internal __section_{activate,deactivate} functions for sparse-vmemmap and
sparse-non-vmemmap.
That way, we can hide all detail implementation and sub-section dance behind
the __section_{activate,deactivate} functions.

> +
> @@ -741,49 +895,31 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
>  		unsigned long nr_pages, struct vmem_altmap *altmap)
>  {
>  	unsigned long section_nr = pfn_to_section_nr(start_pfn);
> -	struct mem_section_usage *usage;
>  	struct mem_section *ms;
>  	struct page *memmap;
>  	int ret;
>  
> -	/*
> -	 * no locking for this, because it does its own
> -	 * plus, it does a kmalloc
> -	 */
>  	ret = sparse_index_init(section_nr, nid);
>  	if (ret < 0 && ret != -EEXIST)
>  		return ret;
> -	ret = 0;
> -	memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
> -			altmap);
> -	if (!memmap)
> -		return -ENOMEM;
> -	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
> -	if (!usage) {
> -		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
> -		return -ENOMEM;
> -	}
>  
> -	ms = __pfn_to_section(start_pfn);
> -	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
> -		ret = -EEXIST;
> -		goto out;
> -	}
> +	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
> +	if (IS_ERR(memmap))
> +		return PTR_ERR(memmap);
> +	ret = 0;
>  
>  	/*
>  	 * Poison uninitialized struct pages in order to catch invalid flags
>  	 * combinations.
>  	 */
> -	page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
> +	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
>  
> +	ms = __pfn_to_section(start_pfn);
>  	section_mark_present(ms);
> -	sparse_init_one_section(ms, section_nr, memmap, usage);
> +	sparse_init_one_section(ms, section_nr, memmap, ms->usage);
>  
> -out:
> -	if (ret < 0) {
> -		kfree(usage);
> -		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
> -	}
> +	if (ret < 0)
> +		section_deactivate(start_pfn, nr_pages, nid, altmap);
>  	return ret;
>  }

diff --git a/mm/sparse.c b/mm/sparse.c
index 34f322d14e62..daeb2d7d8dd0 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -900,13 +900,12 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
        int ret;
 
        ret = sparse_index_init(section_nr, nid);
-       if (ret < 0 && ret != -EEXIST)
+       if (ret < 0)
                return ret;
 
        memmap = section_activate(nid, start_pfn, nr_pages, altmap);
        if (IS_ERR(memmap))
                return PTR_ERR(memmap);
-       ret = 0;
 
        /*
         * Poison uninitialized struct pages in order to catch invalid flags
@@ -918,8 +917,6 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
        section_mark_present(ms);
        sparse_init_one_section(ms, section_nr, memmap, ms->usage);
 
-       if (ret < 0)
-               section_deactivate(start_pfn, nr_pages, nid, altmap);
        return ret;
 }
Dan Williams June 4, 2019, 4:47 a.m. UTC | #3
On Mon, May 13, 2019 at 6:55 AM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Mon, May 06, 2019 at 04:40:14PM -0700, Dan Williams wrote:
> >
> > +void subsection_mask_set(unsigned long *map, unsigned long pfn,
> > +             unsigned long nr_pages)
> > +{
> > +     int idx = subsection_map_index(pfn);
> > +     int end = subsection_map_index(pfn + nr_pages - 1);
> > +
> > +     bitmap_set(map, idx, end - idx + 1);
> > +}
> > +
> >  void subsection_map_init(unsigned long pfn, unsigned long nr_pages)
> >  {
> >       int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
> > @@ -219,20 +235,17 @@ void subsection_map_init(unsigned long pfn, unsigned long nr_pages)
> >               return;
> >
> >       for (i = start_sec; i <= end_sec; i++) {
> > -             int idx, end;
> > -             unsigned long pfns;
> >               struct mem_section *ms;
> > +             unsigned long pfns;
> >
> > -             idx = subsection_map_index(pfn);
> >               pfns = min(nr_pages, PAGES_PER_SECTION
> >                               - (pfn & ~PAGE_SECTION_MASK));
> > -             end = subsection_map_index(pfn + pfns - 1);
> > -
> >               ms = __nr_to_section(i);
> > -             bitmap_set(ms->usage->subsection_map, idx, end - idx + 1);
> > +             subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
> >
> >               pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i,
> > -                             pfns, idx, end - idx + 1);
> > +                             pfns, subsection_map_index(pfn),
> > +                             subsection_map_index(pfn + pfns - 1));
>
> I would definetely add subsection_mask_set() and above change to Patch#3.
> I think it suits there better than here.

Yes, done.

>
> >
> >               pfn += pfns;
> >               nr_pages -= pfns;
> > @@ -319,6 +332,15 @@ static void __meminit sparse_init_one_section(struct mem_section *ms,
> >               unsigned long pnum, struct page *mem_map,
> >               struct mem_section_usage *usage)
> >  {
> > +     /*
> > +      * Given that SPARSEMEM_VMEMMAP=y supports sub-section hotplug,
> > +      * ->section_mem_map can not be guaranteed to point to a full
> > +      *  section's worth of memory.  The field is only valid / used
> > +      *  in the SPARSEMEM_VMEMMAP=n case.
> > +      */
> > +     if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
> > +             mem_map = NULL;
> > +
> >       ms->section_mem_map &= ~SECTION_MAP_MASK;
> >       ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
> >                                                       SECTION_HAS_MEM_MAP;
> > @@ -724,10 +746,142 @@ static void free_map_bootmem(struct page *memmap)
> >  #endif /* CONFIG_MEMORY_HOTREMOVE */
> >  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
> >
> > +#ifndef CONFIG_MEMORY_HOTREMOVE
> > +static void free_map_bootmem(struct page *memmap)
> > +{
> > +}
> > +#endif
> > +
> > +static bool is_early_section(struct mem_section *ms)
> > +{
> > +     struct page *usage_page;
> > +
> > +     usage_page = virt_to_page(ms->usage);
> > +     if (PageSlab(usage_page) || PageCompound(usage_page))
> > +             return false;
> > +     else
> > +             return true;
> > +}
> > +
> > +static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
> > +             int nid, struct vmem_altmap *altmap)
> > +{
> > +     DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
> > +     DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
> > +     struct mem_section *ms = __pfn_to_section(pfn);
> > +     bool early_section = is_early_section(ms);
> > +     struct page *memmap = NULL;
> > +     unsigned long *subsection_map = ms->usage
> > +             ? &ms->usage->subsection_map[0] : NULL;
> > +
> > +     subsection_mask_set(map, pfn, nr_pages);
> > +     if (subsection_map)
> > +             bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
> > +
> > +     if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
> > +                             "section already deactivated (%#lx + %ld)\n",
> > +                             pfn, nr_pages))
> > +             return;
> > +
> > +     if (WARN(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)
> > +                             && nr_pages < PAGES_PER_SECTION,
> > +                             "partial memory section removal not supported\n"))
> > +             return;
> > +
> > +     /*
> > +      * There are 3 cases to handle across two configurations
> > +      * (SPARSEMEM_VMEMMAP={y,n}):
> > +      *
> > +      * 1/ deactivation of a partial hot-added section (only possible
> > +      * in the SPARSEMEM_VMEMMAP=y case).
> > +      *    a/ section was present at memory init
> > +      *    b/ section was hot-added post memory init
> > +      * 2/ deactivation of a complete hot-added section
> > +      * 3/ deactivation of a complete section from memory init
> > +      *
> > +      * For 1/, when subsection_map does not empty we will not be
> > +      * freeing the usage map, but still need to free the vmemmap
> > +      * range.
> > +      *
> > +      * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
> > +      */
> > +     bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
> > +     if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
> > +             unsigned long section_nr = pfn_to_section_nr(pfn);
> > +
> > +             if (!early_section) {
> > +                     kfree(ms->usage);
> > +                     ms->usage = NULL;
> > +             }
> > +             memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
> > +             ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
> > +     }
> > +
> > +     if (early_section && memmap)
> > +             free_map_bootmem(memmap);
> > +     else
> > +             depopulate_section_memmap(pfn, nr_pages, altmap);
> > +}
> > +
> > +static struct page * __meminit section_activate(int nid, unsigned long pfn,
> > +             unsigned long nr_pages, struct vmem_altmap *altmap)
> > +{
> > +     DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
> > +     struct mem_section *ms = __pfn_to_section(pfn);
> > +     struct mem_section_usage *usage = NULL;
> > +     unsigned long *subsection_map;
> > +     struct page *memmap;
> > +     int rc = 0;
> > +
> > +     subsection_mask_set(map, pfn, nr_pages);
> > +
> > +     if (!ms->usage) {
> > +             usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
> > +             if (!usage)
> > +                     return ERR_PTR(-ENOMEM);
> > +             ms->usage = usage;
> > +     }
> > +     subsection_map = &ms->usage->subsection_map[0];
> > +
> > +     if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
> > +             rc = -EINVAL;
> > +     else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
> > +             rc = -EEXIST;
> > +     else
> > +             bitmap_or(subsection_map, map, subsection_map,
> > +                             SUBSECTIONS_PER_SECTION);
> > +
> > +     if (rc) {
> > +             if (usage)
> > +                     ms->usage = NULL;
> > +             kfree(usage);
> > +             return ERR_PTR(rc);
> > +     }
> > +
> > +     /*
> > +      * The early init code does not consider partially populated
> > +      * initial sections, it simply assumes that memory will never be
> > +      * referenced.  If we hot-add memory into such a section then we
> > +      * do not need to populate the memmap and can simply reuse what
> > +      * is already there.
> > +      */
> > +     if (nr_pages < PAGES_PER_SECTION && is_early_section(ms))
> > +             return pfn_to_page(pfn);
> > +
> > +     memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
> > +     if (!memmap) {
> > +             section_deactivate(pfn, nr_pages, nid, altmap);
> > +             return ERR_PTR(-ENOMEM);
> > +     }
> > +
> > +     return memmap;
> > +}
>
> I do not really like this.
> Sub-section scheme is only available on CONFIG_SPARSE_VMEMMAP, so I would rather
> have two internal __section_{activate,deactivate} functions for sparse-vmemmap and
> sparse-non-vmemmap.
> That way, we can hide all detail implementation and sub-section dance behind
> the __section_{activate,deactivate} functions.

I don't see the win for doing that. The code would have 2 places to
check when making a change that might apply to both cases. If one
function can handle both configurations then it makes sense to keep
them unified. However, this did prompt me to go kill the unnecessary
WARN(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP...) in section_deactivate()
since that is already covered by check_pfn_span() (formerly
subsection_check()).

>
> > +
> > @@ -741,49 +895,31 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
> >               unsigned long nr_pages, struct vmem_altmap *altmap)
> >  {
> >       unsigned long section_nr = pfn_to_section_nr(start_pfn);
> > -     struct mem_section_usage *usage;
> >       struct mem_section *ms;
> >       struct page *memmap;
> >       int ret;
> >
> > -     /*
> > -      * no locking for this, because it does its own
> > -      * plus, it does a kmalloc
> > -      */
> >       ret = sparse_index_init(section_nr, nid);
> >       if (ret < 0 && ret != -EEXIST)
> >               return ret;
> > -     ret = 0;
> > -     memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
> > -                     altmap);
> > -     if (!memmap)
> > -             return -ENOMEM;
> > -     usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
> > -     if (!usage) {
> > -             depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
> > -             return -ENOMEM;
> > -     }
> >
> > -     ms = __pfn_to_section(start_pfn);
> > -     if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
> > -             ret = -EEXIST;
> > -             goto out;
> > -     }
> > +     memmap = section_activate(nid, start_pfn, nr_pages, altmap);
> > +     if (IS_ERR(memmap))
> > +             return PTR_ERR(memmap);
> > +     ret = 0;
> >
> >       /*
> >        * Poison uninitialized struct pages in order to catch invalid flags
> >        * combinations.
> >        */
> > -     page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
> > +     page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
> >
> > +     ms = __pfn_to_section(start_pfn);
> >       section_mark_present(ms);
> > -     sparse_init_one_section(ms, section_nr, memmap, usage);
> > +     sparse_init_one_section(ms, section_nr, memmap, ms->usage);
> >
> > -out:
> > -     if (ret < 0) {
> > -             kfree(usage);
> > -             depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
> > -     }
> > +     if (ret < 0)
> > +             section_deactivate(start_pfn, nr_pages, nid, altmap);
> >       return ret;
> >  }
>
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 34f322d14e62..daeb2d7d8dd0 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -900,13 +900,12 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
>         int ret;
>
>         ret = sparse_index_init(section_nr, nid);
> -       if (ret < 0 && ret != -EEXIST)
> +       if (ret < 0)
>                 return ret;
>
>         memmap = section_activate(nid, start_pfn, nr_pages, altmap);
>         if (IS_ERR(memmap))
>                 return PTR_ERR(memmap);
> -       ret = 0;
>
>         /*
>          * Poison uninitialized struct pages in order to catch invalid flags
> @@ -918,8 +917,6 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
>         section_mark_present(ms);
>         sparse_init_one_section(ms, section_nr, memmap, ms->usage);
>
> -       if (ret < 0)
> -               section_deactivate(start_pfn, nr_pages, nid, altmap);
>         return ret;
>  }

Done.
diff mbox series

Patch

diff --git a/mm/sparse.c b/mm/sparse.c
index 8867f8901ee2..34f322d14e62 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,8 +83,15 @@  static int __meminit sparse_index_init(unsigned long section_nr, int nid)
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
 	struct mem_section *section;
 
+	/*
+	 * An existing section is possible in the sub-section hotplug
+	 * case. First hot-add instantiates, follow-on hot-add reuses
+	 * the existing section.
+	 *
+	 * The mem_hotplug_lock resolves the apparent race below.
+	 */
 	if (mem_section[root])
-		return -EEXIST;
+		return 0;
 
 	section = sparse_index_alloc(nid);
 	if (!section)
@@ -210,6 +217,15 @@  static inline unsigned long first_present_section_nr(void)
 	return next_present_section_nr(-1);
 }
 
+void subsection_mask_set(unsigned long *map, unsigned long pfn,
+		unsigned long nr_pages)
+{
+	int idx = subsection_map_index(pfn);
+	int end = subsection_map_index(pfn + nr_pages - 1);
+
+	bitmap_set(map, idx, end - idx + 1);
+}
+
 void subsection_map_init(unsigned long pfn, unsigned long nr_pages)
 {
 	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
@@ -219,20 +235,17 @@  void subsection_map_init(unsigned long pfn, unsigned long nr_pages)
 		return;
 
 	for (i = start_sec; i <= end_sec; i++) {
-		int idx, end;
-		unsigned long pfns;
 		struct mem_section *ms;
+		unsigned long pfns;
 
-		idx = subsection_map_index(pfn);
 		pfns = min(nr_pages, PAGES_PER_SECTION
 				- (pfn & ~PAGE_SECTION_MASK));
-		end = subsection_map_index(pfn + pfns - 1);
-
 		ms = __nr_to_section(i);
-		bitmap_set(ms->usage->subsection_map, idx, end - idx + 1);
+		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
 
 		pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i,
-				pfns, idx, end - idx + 1);
+				pfns, subsection_map_index(pfn),
+				subsection_map_index(pfn + pfns - 1));
 
 		pfn += pfns;
 		nr_pages -= pfns;
@@ -319,6 +332,15 @@  static void __meminit sparse_init_one_section(struct mem_section *ms,
 		unsigned long pnum, struct page *mem_map,
 		struct mem_section_usage *usage)
 {
+	/*
+	 * Given that SPARSEMEM_VMEMMAP=y supports sub-section hotplug,
+	 * ->section_mem_map can not be guaranteed to point to a full
+	 *  section's worth of memory.  The field is only valid / used
+	 *  in the SPARSEMEM_VMEMMAP=n case.
+	 */
+	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+		mem_map = NULL;
+
 	ms->section_mem_map &= ~SECTION_MAP_MASK;
 	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
 							SECTION_HAS_MEM_MAP;
@@ -724,10 +746,142 @@  static void free_map_bootmem(struct page *memmap)
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+#ifndef CONFIG_MEMORY_HOTREMOVE
+static void free_map_bootmem(struct page *memmap)
+{
+}
+#endif
+
+static bool is_early_section(struct mem_section *ms)
+{
+	struct page *usage_page;
+
+	usage_page = virt_to_page(ms->usage);
+	if (PageSlab(usage_page) || PageCompound(usage_page))
+		return false;
+	else
+		return true;
+}
+
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+		int nid, struct vmem_altmap *altmap)
+{
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+	struct mem_section *ms = __pfn_to_section(pfn);
+	bool early_section = is_early_section(ms);
+	struct page *memmap = NULL;
+	unsigned long *subsection_map = ms->usage
+		? &ms->usage->subsection_map[0] : NULL;
+
+	subsection_mask_set(map, pfn, nr_pages);
+	if (subsection_map)
+		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+				"section already deactivated (%#lx + %ld)\n",
+				pfn, nr_pages))
+		return;
+
+	if (WARN(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)
+				&& nr_pages < PAGES_PER_SECTION,
+				"partial memory section removal not supported\n"))
+		return;
+
+	/*
+	 * There are 3 cases to handle across two configurations
+	 * (SPARSEMEM_VMEMMAP={y,n}):
+	 *
+	 * 1/ deactivation of a partial hot-added section (only possible
+	 * in the SPARSEMEM_VMEMMAP=y case).
+	 *    a/ section was present at memory init
+	 *    b/ section was hot-added post memory init
+	 * 2/ deactivation of a complete hot-added section
+	 * 3/ deactivation of a complete section from memory init
+	 *
+	 * For 1/, when subsection_map does not empty we will not be
+	 * freeing the usage map, but still need to free the vmemmap
+	 * range.
+	 *
+	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
+	 */
+	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+
+		if (!early_section) {
+			kfree(ms->usage);
+			ms->usage = NULL;
+		}
+		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+	}
+
+	if (early_section && memmap)
+		free_map_bootmem(memmap);
+	else
+		depopulate_section_memmap(pfn, nr_pages, altmap);
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	struct mem_section *ms = __pfn_to_section(pfn);
+	struct mem_section_usage *usage = NULL;
+	unsigned long *subsection_map;
+	struct page *memmap;
+	int rc = 0;
+
+	subsection_mask_set(map, pfn, nr_pages);
+
+	if (!ms->usage) {
+		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+		if (!usage)
+			return ERR_PTR(-ENOMEM);
+		ms->usage = usage;
+	}
+	subsection_map = &ms->usage->subsection_map[0];
+
+	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+		rc = -EINVAL;
+	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+		rc = -EEXIST;
+	else
+		bitmap_or(subsection_map, map, subsection_map,
+				SUBSECTIONS_PER_SECTION);
+
+	if (rc) {
+		if (usage)
+			ms->usage = NULL;
+		kfree(usage);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * The early init code does not consider partially populated
+	 * initial sections, it simply assumes that memory will never be
+	 * referenced.  If we hot-add memory into such a section then we
+	 * do not need to populate the memmap and can simply reuse what
+	 * is already there.
+	 */
+	if (nr_pages < PAGES_PER_SECTION && is_early_section(ms))
+		return pfn_to_page(pfn);
+
+	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+	if (!memmap) {
+		section_deactivate(pfn, nr_pages, nid, altmap);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return memmap;
+}
+
 /**
- * sparse_add_one_section - add a memory section
+ * sparse_add_section - add a memory section, or populate an existing one
  * @nid: The node to add section on
  * @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
  * @altmap: device page map
  *
  * This is only intended for hotplug.
@@ -741,49 +895,31 @@  int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
-	struct mem_section_usage *usage;
 	struct mem_section *ms;
 	struct page *memmap;
 	int ret;
 
-	/*
-	 * no locking for this, because it does its own
-	 * plus, it does a kmalloc
-	 */
 	ret = sparse_index_init(section_nr, nid);
 	if (ret < 0 && ret != -EEXIST)
 		return ret;
-	ret = 0;
-	memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
-			altmap);
-	if (!memmap)
-		return -ENOMEM;
-	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
-	if (!usage) {
-		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
-		return -ENOMEM;
-	}
 
-	ms = __pfn_to_section(start_pfn);
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
-		ret = -EEXIST;
-		goto out;
-	}
+	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+	if (IS_ERR(memmap))
+		return PTR_ERR(memmap);
+	ret = 0;
 
 	/*
 	 * Poison uninitialized struct pages in order to catch invalid flags
 	 * combinations.
 	 */
-	page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
+	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
 
+	ms = __pfn_to_section(start_pfn);
 	section_mark_present(ms);
-	sparse_init_one_section(ms, section_nr, memmap, usage);
+	sparse_init_one_section(ms, section_nr, memmap, ms->usage);
 
-out:
-	if (ret < 0) {
-		kfree(usage);
-		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
-	}
+	if (ret < 0)
+		section_deactivate(start_pfn, nr_pages, nid, altmap);
 	return ret;
 }
 
@@ -818,54 +954,13 @@  static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usage(struct page *memmap,
-		struct mem_section_usage *usage, unsigned long pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap)
-{
-	struct page *usage_page;
-
-	if (!usage)
-		return;
-
-	usage_page = virt_to_page(usage);
-	/*
-	 * Check to see if allocation came from hot-plug-add
-	 */
-	if (PageSlab(usage_page) || PageCompound(usage_page)) {
-		kfree(usage);
-		if (memmap)
-			depopulate_section_memmap(pfn, nr_pages, altmap);
-		return;
-	}
-
-	/*
-	 * The usemap came from bootmem. This is packed with other usemaps
-	 * on the section which has pgdat at boot time. Just keep it as is now.
-	 */
-
-	if (memmap)
-		free_map_bootmem(memmap);
-}
-
 void sparse_remove_section(struct zone *zone, struct mem_section *ms,
 		unsigned long pfn, unsigned long nr_pages,
 		unsigned long map_offset, struct vmem_altmap *altmap)
 {
-	struct page *memmap = NULL;
-	struct mem_section_usage *usage = NULL;
-
-	if (ms->section_mem_map) {
-		usage = ms->usage;
-		memmap = sparse_decode_mem_map(ms->section_mem_map,
-						__section_nr(ms));
-		ms->section_mem_map = 0;
-		ms->usage = NULL;
-	}
-
-	clear_hwpoisoned_pages(memmap + map_offset,
-			PAGES_PER_SECTION - map_offset);
-	free_section_usage(memmap, usage, section_nr_to_pfn(__section_nr(ms)),
-			PAGES_PER_SECTION, altmap);
+	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+			nr_pages - map_offset);
+	section_deactivate(pfn, nr_pages, zone_to_nid(zone), altmap);
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_MEMORY_HOTPLUG */