diff mbox series

[v2,4/5] mm, memory-hotplug: Rework unregister_mem_sect_under_nodes

Message ID 20181127162005.15833-5-osalvador@suse.de (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Oscar Salvador Nov. 27, 2018, 4:20 p.m. UTC
From: Oscar Salvador <osalvador@suse.com>

This tries to address another issue about accessing
unitiliazed pages.

Jonathan reported a problem [1] where we can access steal pages
in case we hot-remove memory without onlining it first.

This time is in unregister_mem_sect_under_nodes.
This function tries to get the nid from the pfn and then
tries to remove the symlink between mem_blk <-> nid and vice versa.

Since we already know the nid in remove_memory(), we can pass
it down the chain to unregister_mem_sect_under_nodes.
There we can just remove the symlinks without the need
to look into the pages.

This also allows us to cleanup unregister_mem_sect_under_nodes.

[1] https://www.spinics.net/lists/linux-mm/msg161316.html

Signed-off-by: Oscar Salvador <osalvador@suse.de>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/base/memory.c  |  9 ++++-----
 drivers/base/node.c    | 39 ++++++---------------------------------
 include/linux/memory.h |  2 +-
 include/linux/node.h   |  9 ++++-----
 mm/memory_hotplug.c    |  2 +-
 5 files changed, 16 insertions(+), 45 deletions(-)

Comments

Anshuman Khandual March 24, 2019, 6:48 a.m. UTC | #1
On 11/27/2018 09:50 PM, Oscar Salvador wrote:
> From: Oscar Salvador <osalvador@suse.com>
> 
> This tries to address another issue about accessing
> unitiliazed pages.
> 
> Jonathan reported a problem [1] where we can access steal pages
> in case we hot-remove memory without onlining it first.
> 
> This time is in unregister_mem_sect_under_nodes.
> This function tries to get the nid from the pfn and then
> tries to remove the symlink between mem_blk <-> nid and vice versa.
> 
> Since we already know the nid in remove_memory(), we can pass
> it down the chain to unregister_mem_sect_under_nodes.
> There we can just remove the symlinks without the need
> to look into the pages.
> 
> This also allows us to cleanup unregister_mem_sect_under_nodes.
> 
> [1] https://www.spinics.net/lists/linux-mm/msg161316.html
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> ---
>  drivers/base/memory.c  |  9 ++++-----
>  drivers/base/node.c    | 39 ++++++---------------------------------
>  include/linux/memory.h |  2 +-
>  include/linux/node.h   |  9 ++++-----
>  mm/memory_hotplug.c    |  2 +-
>  5 files changed, 16 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 0e5985682642..3d8c65d84bea 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -744,8 +744,7 @@ unregister_memory(struct memory_block *memory)
>  	device_unregister(&memory->dev);
>  }
>  
> -static int remove_memory_section(unsigned long node_id,
> -			       struct mem_section *section, int phys_device)
> +static int remove_memory_section(unsigned long nid, struct mem_section *section)
>  {
>  	struct memory_block *mem;
>  
> @@ -759,7 +758,7 @@ static int remove_memory_section(unsigned long node_id,
>  	if (!mem)
>  		goto out_unlock;
>  
> -	unregister_mem_sect_under_nodes(mem, __section_nr(section));
> +	unregister_mem_sect_under_nodes(nid, mem);
>  
>  	mem->section_count--;
>  	if (mem->section_count == 0)
> @@ -772,12 +771,12 @@ static int remove_memory_section(unsigned long node_id,
>  	return 0;
>  }
>  
> -int unregister_memory_section(struct mem_section *section)
> +int unregister_memory_section(int nid, struct mem_section *section)
>  {
>  	if (!present_section(section))
>  		return -EINVAL;
>  
> -	return remove_memory_section(0, section, 0);
> +	return remove_memory_section(nid, section);
>  }
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 86d6cd92ce3d..0858f7f3c7cd 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -453,40 +453,13 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
>  	return 0;
>  }
>  
> -/* unregister memory section under all nodes that it spans */
> -int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
> -				    unsigned long phys_index)
> +/* Remove symlink between node <-> mem_blk */
> +void unregister_mem_sect_under_nodes(int nid, struct memory_block *mem_blk)
>  {
> -	NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
> -	unsigned long pfn, sect_start_pfn, sect_end_pfn;
> -
> -	if (!mem_blk) {
> -		NODEMASK_FREE(unlinked_nodes);
> -		return -EFAULT;
> -	}
> -	if (!unlinked_nodes)
> -		return -ENOMEM;
> -	nodes_clear(*unlinked_nodes);
> -
> -	sect_start_pfn = section_nr_to_pfn(phys_index);
> -	sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
> -	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
> -		int nid;
> -
> -		nid = get_nid_for_pfn(pfn);
> -		if (nid < 0)
> -			continue;
> -		if (!node_online(nid))
> -			continue;
> -		if (node_test_and_set(nid, *unlinked_nodes))
> -			continue;
> -		sysfs_remove_link(&node_devices[nid]->dev.kobj,
> -			 kobject_name(&mem_blk->dev.kobj));
> -		sysfs_remove_link(&mem_blk->dev.kobj,
> -			 kobject_name(&node_devices[nid]->dev.kobj));
> -	}
> -	NODEMASK_FREE(unlinked_nodes);
> -	return 0;
> +	sysfs_remove_link(&node_devices[nid]->dev.kobj,
> +			kobject_name(&mem_blk->dev.kobj));
> +	sysfs_remove_link(&mem_blk->dev.kobj,
> +			kobject_name(&node_devices[nid]->dev.kobj));

Hello Oscar,

Passing down node ID till unregister_mem_sect_under_nodes() solves the problem of
querying struct page for nid but the current code assumes that the pfn range for
any given memory section can have different node IDs. Hence it scans over the
section and try to remove all possible node <---> memory block sysfs links.

I am just wondering is that assumption even correct ? Can we really have a memory
section which belongs to different nodes ? Is that even possible.

- Anshuman
Oscar Salvador March 25, 2019, 7:40 a.m. UTC | #2
On Sun, Mar 24, 2019 at 12:18:26PM +0530, Anshuman Khandual wrote:
> Hello Oscar,

Hi Anshuman,

> Passing down node ID till unregister_mem_sect_under_nodes() solves the problem of
> querying struct page for nid but the current code assumes that the pfn range for
> any given memory section can have different node IDs. Hence it scans over the
> section and try to remove all possible node <---> memory block sysfs links.
> 
> I am just wondering is that assumption even correct ? Can we really have a memory
> section which belongs to different nodes ? Is that even possible.

Yes, current code assumes that, but looking at when we init sections at boot
stage, it seems like a 1:1 map to me.

E.g, in memory_present(), we do encode the nid in section's section_mem_map
field to use that later on in sparse_init(), and get the node we should allocate
the data structures from.

And in memory_present() itself, in case we do not use page's flags field,
we end up using the section_to_node_table[] table, which is clearly a 1:1 map.

So, I might be wrong here, but I think that we do not really have nodes mixed
in a section.
Michal Hocko March 25, 2019, 8:04 a.m. UTC | #3
On Mon 25-03-19 08:40:32, Oscar Salvador wrote:
> On Sun, Mar 24, 2019 at 12:18:26PM +0530, Anshuman Khandual wrote:
> > Hello Oscar,
> 
> Hi Anshuman,
> 
> > Passing down node ID till unregister_mem_sect_under_nodes() solves the problem of
> > querying struct page for nid but the current code assumes that the pfn range for
> > any given memory section can have different node IDs. Hence it scans over the
> > section and try to remove all possible node <---> memory block sysfs links.
> > 
> > I am just wondering is that assumption even correct ? Can we really have a memory
> > section which belongs to different nodes ? Is that even possible.
> 
> Yes, current code assumes that, but looking at when we init sections at boot
> stage, it seems like a 1:1 map to me.
> 
> E.g, in memory_present(), we do encode the nid in section's section_mem_map
> field to use that later on in sparse_init(), and get the node we should allocate
> the data structures from.
> 
> And in memory_present() itself, in case we do not use page's flags field,
> we end up using the section_to_node_table[] table, which is clearly a 1:1 map.
> 
> So, I might be wrong here, but I think that we do not really have nodes mixed
> in a section.

No, unfortunately two nodes might share the same section indeed. Have a
look at 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a
Oscar Salvador March 25, 2019, 8:14 a.m. UTC | #4
On Mon, Mar 25, 2019 at 09:04:53AM +0100, Michal Hocko wrote:
> No, unfortunately two nodes might share the same section indeed. Have a
> look at 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a

Heh, I see, I guess one cannot assume anything when it comes to HW.
Well, at least we know that for real now.

I always thought that this was merely a kind of hardcoded assumption that we
just made in an attempt to avoid breaking things.
diff mbox series

Patch

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 0e5985682642..3d8c65d84bea 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -744,8 +744,7 @@  unregister_memory(struct memory_block *memory)
 	device_unregister(&memory->dev);
 }
 
-static int remove_memory_section(unsigned long node_id,
-			       struct mem_section *section, int phys_device)
+static int remove_memory_section(unsigned long nid, struct mem_section *section)
 {
 	struct memory_block *mem;
 
@@ -759,7 +758,7 @@  static int remove_memory_section(unsigned long node_id,
 	if (!mem)
 		goto out_unlock;
 
-	unregister_mem_sect_under_nodes(mem, __section_nr(section));
+	unregister_mem_sect_under_nodes(nid, mem);
 
 	mem->section_count--;
 	if (mem->section_count == 0)
@@ -772,12 +771,12 @@  static int remove_memory_section(unsigned long node_id,
 	return 0;
 }
 
-int unregister_memory_section(struct mem_section *section)
+int unregister_memory_section(int nid, struct mem_section *section)
 {
 	if (!present_section(section))
 		return -EINVAL;
 
-	return remove_memory_section(0, section, 0);
+	return remove_memory_section(nid, section);
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 86d6cd92ce3d..0858f7f3c7cd 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -453,40 +453,13 @@  int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
 	return 0;
 }
 
-/* unregister memory section under all nodes that it spans */
-int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-				    unsigned long phys_index)
+/* Remove symlink between node <-> mem_blk */
+void unregister_mem_sect_under_nodes(int nid, struct memory_block *mem_blk)
 {
-	NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
-	unsigned long pfn, sect_start_pfn, sect_end_pfn;
-
-	if (!mem_blk) {
-		NODEMASK_FREE(unlinked_nodes);
-		return -EFAULT;
-	}
-	if (!unlinked_nodes)
-		return -ENOMEM;
-	nodes_clear(*unlinked_nodes);
-
-	sect_start_pfn = section_nr_to_pfn(phys_index);
-	sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
-	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
-		int nid;
-
-		nid = get_nid_for_pfn(pfn);
-		if (nid < 0)
-			continue;
-		if (!node_online(nid))
-			continue;
-		if (node_test_and_set(nid, *unlinked_nodes))
-			continue;
-		sysfs_remove_link(&node_devices[nid]->dev.kobj,
-			 kobject_name(&mem_blk->dev.kobj));
-		sysfs_remove_link(&mem_blk->dev.kobj,
-			 kobject_name(&node_devices[nid]->dev.kobj));
-	}
-	NODEMASK_FREE(unlinked_nodes);
-	return 0;
+	sysfs_remove_link(&node_devices[nid]->dev.kobj,
+			kobject_name(&mem_blk->dev.kobj));
+	sysfs_remove_link(&mem_blk->dev.kobj,
+			kobject_name(&node_devices[nid]->dev.kobj));
 }
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a6ddefc60517..d75ec88ca09d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -113,7 +113,7 @@  extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 int hotplug_memory_register(int nid, struct mem_section *section);
 #ifdef CONFIG_MEMORY_HOTREMOVE
-extern int unregister_memory_section(struct mem_section *);
+extern int unregister_memory_section(int nid, struct mem_section *);
 #endif
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/node.h b/include/linux/node.h
index 257bb3d6d014..488c1333bb06 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -72,8 +72,8 @@  extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int register_mem_sect_under_node(struct memory_block *mem_blk,
 						void *arg);
-extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-					   unsigned long phys_index);
+extern void unregister_mem_sect_under_nodes(int nid,
+			struct memory_block *mem_blk);
 
 #ifdef CONFIG_HUGETLBFS
 extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
@@ -105,10 +105,9 @@  static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
 {
 	return 0;
 }
-static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
-						  unsigned long phys_index)
+static inline void unregister_mem_sect_under_nodes(int nid,
+				struct memory_block *mem_blk)
 {
-	return 0;
 }
 
 static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4fe42ccb0be4..49b91907e19e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -544,7 +544,7 @@  static int __remove_section(int nid, struct mem_section *ms,
 	if (!valid_section(ms))
 		return ret;
 
-	ret = unregister_memory_section(ms);
+	ret = unregister_memory_section(nid, ms);
 	if (ret)
 		return ret;