diff mbox series

[v2] drivers/base/node.c: Simplify unregister_memory_block_under_nodes()

Message ID 20190719135244.15242-1-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v2] drivers/base/node.c: Simplify unregister_memory_block_under_nodes() | expand

Commit Message

David Hildenbrand July 19, 2019, 1:52 p.m. UTC
We don't allow to offline memory block devices that belong to multiple
numa nodes. Therefore, such devices can never get removed. It is
sufficient to process a single node when removing the memory block. No
need to iterate over each and every PFN.

We already have the nid stored for each memory block. Make sure that
the nid always has a sane value.

Please note that checking for node_online(nid) is not required. If we
would have a memory block belonging to a node that is no longer offline,
then we would have a BUG in the node offlining code.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: David Hildenbrand <david@redhat.com>
---

v1 -> v2:
- Remove the "mixed nid" part, add a comment instead. Drop the warning.

---
 drivers/base/memory.c |  1 +
 drivers/base/node.c   | 39 +++++++++++++++------------------------
 2 files changed, 16 insertions(+), 24 deletions(-)

Comments

David Hildenbrand Nov. 27, 2019, 4:53 p.m. UTC | #1
On 19.07.19 15:52, David Hildenbrand wrote:
> We don't allow to offline memory block devices that belong to multiple
> numa nodes. Therefore, such devices can never get removed. It is
> sufficient to process a single node when removing the memory block. No
> need to iterate over each and every PFN.
> 
> We already have the nid stored for each memory block. Make sure that
> the nid always has a sane value.
> 
> Please note that checking for node_online(nid) is not required. If we
> would have a memory block belonging to a node that is no longer offline,
> then we would have a BUG in the node offlining code.
> 
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Stephen Rothwell <sfr@canb.auug.org.au>
> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
> 
> v1 -> v2:
> - Remove the "mixed nid" part, add a comment instead. Drop the warning.
> 
> ---
>  drivers/base/memory.c |  1 +
>  drivers/base/node.c   | 39 +++++++++++++++------------------------
>  2 files changed, 16 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 20c39d1bcef8..154d5d4a0779 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -674,6 +674,7 @@ static int init_memory_block(struct memory_block **memory,
>  	mem->state = state;
>  	start_pfn = section_nr_to_pfn(mem->start_section_nr);
>  	mem->phys_device = arch_get_memory_phys_device(start_pfn);
> +	mem->nid = NUMA_NO_NODE;
>  
>  	ret = register_memory(mem);
>  
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 75b7e6f6535b..840c95baa1d8 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -759,8 +759,6 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  	int ret, nid = *(int *)arg;
>  	unsigned long pfn, sect_start_pfn, sect_end_pfn;
>  
> -	mem_blk->nid = nid;
> -
>  	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
>  	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
>  	sect_end_pfn += PAGES_PER_SECTION - 1;
> @@ -789,6 +787,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  			if (page_nid != nid)
>  				continue;
>  		}
> +
> +		/*
> +		 * If this memory block spans multiple nodes, we only indicate
> +		 * the last processed node.
> +		 */
> +		mem_blk->nid = nid;
> +
>  		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
>  					&mem_blk->dev.kobj,
>  					kobject_name(&mem_blk->dev.kobj));
> @@ -804,32 +809,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  }
>  
>  /*
> - * Unregister memory block device under all nodes that it spans.
> - * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
> + * Unregister a memory block device under the node it spans. Memory blocks
> + * with multiple nodes cannot be offlined and therefore also never be removed.
>   */
>  void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
>  {
> -	unsigned long pfn, sect_start_pfn, sect_end_pfn;
> -	static nodemask_t unlinked_nodes;
> -
> -	nodes_clear(unlinked_nodes);
> -	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
> -	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
> -	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
> -		int nid;
> +	if (mem_blk->nid == NUMA_NO_NODE)
> +		return;
>  
> -		nid = get_nid_for_pfn(pfn);
> -		if (nid < 0)
> -			continue;
> -		if (!node_online(nid))
> -			continue;
> -		if (node_test_and_set(nid, unlinked_nodes))
> -			continue;
> -		sysfs_remove_link(&node_devices[nid]->dev.kobj,
> -			 kobject_name(&mem_blk->dev.kobj));
> -		sysfs_remove_link(&mem_blk->dev.kobj,
> -			 kobject_name(&node_devices[nid]->dev.kobj));
> -	}
> +	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
> +			  kobject_name(&mem_blk->dev.kobj));
> +	sysfs_remove_link(&mem_blk->dev.kobj,
> +			  kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
>  }
>  
>  int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
> 


Just a note that this was actually also a bugfix as noted by Chris.

If the memory we are removing was never onlined,
get_nid_for_pfn()->pfn_to_nid() will return garbage. Removing will
succeed but links will remain in place.

Can be triggered by

1. hotplugging a DIMM to node 1
2. not onlining the memory blocks
3. unplugging it
4. re-plugging it to node 1

We will trigger the BUG_ON(ret) in add_memory_resource(), because
link_mem_sections() will return with -EEXIST.
Andrew Morton Nov. 27, 2019, 10:15 p.m. UTC | #2
On Wed, 27 Nov 2019 17:53:12 +0100 David Hildenbrand <david@redhat.com> wrote:

> Just a note that this was actually also a bugfix as noted by Chris.
> 
> If the memory we are removing was never onlined,
> get_nid_for_pfn()->pfn_to_nid() will return garbage. Removing will
> succeed but links will remain in place.
> 
> Can be triggered by
> 
> 1. hotplugging a DIMM to node 1
> 2. not onlining the memory blocks
> 3. unplugging it
> 4. re-plugging it to node 1
> 
> We will trigger the BUG_ON(ret) in add_memory_resource(), because
> link_mem_sections() will return with -EEXIST.

Oh.  In that case case we please redo the patch as a bugfix? 
Appropriate title and changelog?  And perhaps the bugfix can be split
from the cleanup, to make the former more backportable?
David Hildenbrand Nov. 27, 2019, 10:55 p.m. UTC | #3
> Am 27.11.2019 um 23:15 schrieb Andrew Morton <akpm@linux-foundation.org>:
> 
> On Wed, 27 Nov 2019 17:53:12 +0100 David Hildenbrand <david@redhat.com> wrote:
> 
>> Just a note that this was actually also a bugfix as noted by Chris.
>> 
>> If the memory we are removing was never onlined,
>> get_nid_for_pfn()->pfn_to_nid() will return garbage. Removing will
>> succeed but links will remain in place.
>> 
>> Can be triggered by
>> 
>> 1. hotplugging a DIMM to node 1
>> 2. not onlining the memory blocks
>> 3. unplugging it
>> 4. re-plugging it to node 1
>> 
>> We will trigger the BUG_ON(ret) in add_memory_resource(), because
>> link_mem_sections() will return with -EEXIST.
> 
> Oh.  In that case case we please redo the patch as a bugfix? 
> Appropriate title and changelog?  And perhaps the bugfix can be split
> from the cleanup, to make the former more backportable?

This is already upstream (d84f2f5a7552 ),so I‘m afraid we can‘t do anything about it. (When your cleanups turn into bugfixes ...).

I can still try to send stable patches, though ...
diff mbox series

Patch

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 20c39d1bcef8..154d5d4a0779 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -674,6 +674,7 @@  static int init_memory_block(struct memory_block **memory,
 	mem->state = state;
 	start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	mem->phys_device = arch_get_memory_phys_device(start_pfn);
+	mem->nid = NUMA_NO_NODE;
 
 	ret = register_memory(mem);
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 75b7e6f6535b..840c95baa1d8 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -759,8 +759,6 @@  static int register_mem_sect_under_node(struct memory_block *mem_blk,
 	int ret, nid = *(int *)arg;
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
 
-	mem_blk->nid = nid;
-
 	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
 	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
 	sect_end_pfn += PAGES_PER_SECTION - 1;
@@ -789,6 +787,13 @@  static int register_mem_sect_under_node(struct memory_block *mem_blk,
 			if (page_nid != nid)
 				continue;
 		}
+
+		/*
+		 * If this memory block spans multiple nodes, we only indicate
+		 * the last processed node.
+		 */
+		mem_blk->nid = nid;
+
 		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
 					&mem_blk->dev.kobj,
 					kobject_name(&mem_blk->dev.kobj));
@@ -804,32 +809,18 @@  static int register_mem_sect_under_node(struct memory_block *mem_blk,
 }
 
 /*
- * Unregister memory block device under all nodes that it spans.
- * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
+ * Unregister a memory block device under the node it spans. Memory blocks
+ * with multiple nodes cannot be offlined and therefore also never be removed.
  */
 void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	unsigned long pfn, sect_start_pfn, sect_end_pfn;
-	static nodemask_t unlinked_nodes;
-
-	nodes_clear(unlinked_nodes);
-	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
-	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
-	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
-		int nid;
+	if (mem_blk->nid == NUMA_NO_NODE)
+		return;
 
-		nid = get_nid_for_pfn(pfn);
-		if (nid < 0)
-			continue;
-		if (!node_online(nid))
-			continue;
-		if (node_test_and_set(nid, unlinked_nodes))
-			continue;
-		sysfs_remove_link(&node_devices[nid]->dev.kobj,
-			 kobject_name(&mem_blk->dev.kobj));
-		sysfs_remove_link(&mem_blk->dev.kobj,
-			 kobject_name(&node_devices[nid]->dev.kobj));
-	}
+	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
+			  kobject_name(&mem_blk->dev.kobj));
+	sysfs_remove_link(&mem_blk->dev.kobj,
+			  kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
 }
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)