[v1] drivers/base/node.c: Simplify unregister_memory_block_under_nodes()
diff mbox series

Message ID 20190718142239.7205-1-david@redhat.com
State New
Headers show
Series
  • [v1] drivers/base/node.c: Simplify unregister_memory_block_under_nodes()
Related show

Commit Message

David Hildenbrand July 18, 2019, 2:22 p.m. UTC
We don't allow to offline memory block devices that belong to multiple
numa nodes. Therefore, such devices can never get removed. It is
sufficient to process a single node when removing the memory block.

Remember for each memory block if it belongs to no, a single, or mixed
nodes, so we can use that information to skip unregistering or print a
warning (essentially a safety net to catch BUGs).

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/base/memory.c  |  1 +
 drivers/base/node.c    | 40 ++++++++++++++++------------------------
 include/linux/memory.h |  4 +++-
 3 files changed, 20 insertions(+), 25 deletions(-)

Comments

David Hildenbrand July 19, 2019, 8:14 a.m. UTC | #1
On 18.07.19 16:22, David Hildenbrand wrote:
> We don't allow to offline memory block devices that belong to multiple
> numa nodes. Therefore, such devices can never get removed. It is
> sufficient to process a single node when removing the memory block.
> 
> Remember for each memory block if it belongs to no, a single, or mixed
> nodes, so we can use that information to skip unregistering or print a
> warning (essentially a safety net to catch BUGs).
> 
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Stephen Rothwell <sfr@canb.auug.org.au>
> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  drivers/base/memory.c  |  1 +
>  drivers/base/node.c    | 40 ++++++++++++++++------------------------
>  include/linux/memory.h |  4 +++-
>  3 files changed, 20 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 20c39d1bcef8..154d5d4a0779 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -674,6 +674,7 @@ static int init_memory_block(struct memory_block **memory,
>  	mem->state = state;
>  	start_pfn = section_nr_to_pfn(mem->start_section_nr);
>  	mem->phys_device = arch_get_memory_phys_device(start_pfn);
> +	mem->nid = NUMA_NO_NODE;
>  
>  	ret = register_memory(mem);
>  
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 75b7e6f6535b..29d27b8d5fda 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -759,8 +759,6 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  	int ret, nid = *(int *)arg;
>  	unsigned long pfn, sect_start_pfn, sect_end_pfn;
>  
> -	mem_blk->nid = nid;
> -
>  	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
>  	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
>  	sect_end_pfn += PAGES_PER_SECTION - 1;
> @@ -789,6 +787,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  			if (page_nid != nid)
>  				continue;
>  		}
> +
> +		/* this memory block spans this node */
> +		if (mem_blk->nid == NUMA_NO_NODE)
> +			mem_blk->nid = nid;
> +		else
> +			mem_blk->nid = NUMA_NO_NODE - 1;
> +

Although I am not sure if it can happen, I think it is better to have

if (mem_blk->nid == NUMA_NO_NODE)
	mem_blk->nid = nid;
else if (mem_blk->nid != nid)
	mem_blk->nid = NUMA_NO_NODE - 1;
Michal Hocko July 19, 2019, 8:42 a.m. UTC | #2
On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
> We don't allow to offline memory block devices that belong to multiple
> numa nodes. Therefore, such devices can never get removed. It is
> sufficient to process a single node when removing the memory block.
> 
> Remember for each memory block if it belongs to no, a single, or mixed
> nodes, so we can use that information to skip unregistering or print a
> warning (essentially a safety net to catch BUGs).

I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
node that is magic. Why should we even care? In other words why is this
patch an improvement?

> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Stephen Rothwell <sfr@canb.auug.org.au>
> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  drivers/base/memory.c  |  1 +
>  drivers/base/node.c    | 40 ++++++++++++++++------------------------
>  include/linux/memory.h |  4 +++-
>  3 files changed, 20 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 20c39d1bcef8..154d5d4a0779 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -674,6 +674,7 @@ static int init_memory_block(struct memory_block **memory,
>  	mem->state = state;
>  	start_pfn = section_nr_to_pfn(mem->start_section_nr);
>  	mem->phys_device = arch_get_memory_phys_device(start_pfn);
> +	mem->nid = NUMA_NO_NODE;
>  
>  	ret = register_memory(mem);
>  
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 75b7e6f6535b..29d27b8d5fda 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -759,8 +759,6 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  	int ret, nid = *(int *)arg;
>  	unsigned long pfn, sect_start_pfn, sect_end_pfn;
>  
> -	mem_blk->nid = nid;
> -
>  	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
>  	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
>  	sect_end_pfn += PAGES_PER_SECTION - 1;
> @@ -789,6 +787,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  			if (page_nid != nid)
>  				continue;
>  		}
> +
> +		/* this memory block spans this node */
> +		if (mem_blk->nid == NUMA_NO_NODE)
> +			mem_blk->nid = nid;
> +		else
> +			mem_blk->nid = NUMA_NO_NODE - 1;
> +
>  		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
>  					&mem_blk->dev.kobj,
>  					kobject_name(&mem_blk->dev.kobj));
> @@ -804,32 +809,19 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>  }
>  
>  /*
> - * Unregister memory block device under all nodes that it spans.
> - * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
> + * Unregister a memory block device under the node it spans. Memory blocks
> + * with multiple nodes cannot be offlined and therefore also never be removed.
>   */
>  void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
>  {
> -	unsigned long pfn, sect_start_pfn, sect_end_pfn;
> -	static nodemask_t unlinked_nodes;
> -
> -	nodes_clear(unlinked_nodes);
> -	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
> -	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
> -	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
> -		int nid;
> +	if (mem_blk->nid == NUMA_NO_NODE ||
> +	    WARN_ON_ONCE(mem_blk->nid == NUMA_NO_NODE - 1))
> +		return;
>  
> -		nid = get_nid_for_pfn(pfn);
> -		if (nid < 0)
> -			continue;
> -		if (!node_online(nid))
> -			continue;
> -		if (node_test_and_set(nid, unlinked_nodes))
> -			continue;
> -		sysfs_remove_link(&node_devices[nid]->dev.kobj,
> -			 kobject_name(&mem_blk->dev.kobj));
> -		sysfs_remove_link(&mem_blk->dev.kobj,
> -			 kobject_name(&node_devices[nid]->dev.kobj));
> -	}
> +	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
> +		 kobject_name(&mem_blk->dev.kobj));
> +	sysfs_remove_link(&mem_blk->dev.kobj,
> +		 kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
>  }
>  
>  int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
> diff --git a/include/linux/memory.h b/include/linux/memory.h
> index 02e633f3ede0..c91af10d5fb4 100644
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -33,7 +33,9 @@ struct memory_block {
>  	void *hw;			/* optional pointer to fw/hw data */
>  	int (*phys_callback)(struct memory_block *);
>  	struct device dev;
> -	int nid;			/* NID for this memory block */
> +	int nid;			/* NID for this memory block.
> +					   - NUMA_NO_NODE: uninitialized
> +					   - NUMA_NO_NODE - 1: mixed nodes */
>  };
>  
>  int arch_get_memory_phys_device(unsigned long start_pfn);
> -- 
> 2.21.0
David Hildenbrand July 19, 2019, 8:48 a.m. UTC | #3
On 19.07.19 10:42, Michal Hocko wrote:
> On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
>> We don't allow to offline memory block devices that belong to multiple
>> numa nodes. Therefore, such devices can never get removed. It is
>> sufficient to process a single node when removing the memory block.
>>
>> Remember for each memory block if it belongs to no, a single, or mixed
>> nodes, so we can use that information to skip unregistering or print a
>> warning (essentially a safety net to catch BUGs).
> 
> I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
> node that is magic. Why should we even care? In other words why is this
> patch an improvement?

I mean we can of course go ahead and drop the "NUMA_NO_NODE - 1" thingy
from the patch. A memory block with multiple nodes would (as of now)
only indicate one of the nodes.

Then there is simply no way to WARN_ON_ONCE() in case unexpected things
would happen. (I mean it really shouldn't happen or we have a BUG
somewhere else)

Alternative: Add "bool mixed_nids;" to "struct memory block".

> 
>> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Stephen Rothwell <sfr@canb.auug.org.au>
>> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>  drivers/base/memory.c  |  1 +
>>  drivers/base/node.c    | 40 ++++++++++++++++------------------------
>>  include/linux/memory.h |  4 +++-
>>  3 files changed, 20 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>> index 20c39d1bcef8..154d5d4a0779 100644
>> --- a/drivers/base/memory.c
>> +++ b/drivers/base/memory.c
>> @@ -674,6 +674,7 @@ static int init_memory_block(struct memory_block **memory,
>>  	mem->state = state;
>>  	start_pfn = section_nr_to_pfn(mem->start_section_nr);
>>  	mem->phys_device = arch_get_memory_phys_device(start_pfn);
>> +	mem->nid = NUMA_NO_NODE;
>>  
>>  	ret = register_memory(mem);
>>  
>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>> index 75b7e6f6535b..29d27b8d5fda 100644
>> --- a/drivers/base/node.c
>> +++ b/drivers/base/node.c
>> @@ -759,8 +759,6 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>>  	int ret, nid = *(int *)arg;
>>  	unsigned long pfn, sect_start_pfn, sect_end_pfn;
>>  
>> -	mem_blk->nid = nid;
>> -
>>  	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
>>  	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
>>  	sect_end_pfn += PAGES_PER_SECTION - 1;
>> @@ -789,6 +787,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>>  			if (page_nid != nid)
>>  				continue;
>>  		}
>> +
>> +		/* this memory block spans this node */
>> +		if (mem_blk->nid == NUMA_NO_NODE)
>> +			mem_blk->nid = nid;
>> +		else
>> +			mem_blk->nid = NUMA_NO_NODE - 1;
>> +
>>  		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
>>  					&mem_blk->dev.kobj,
>>  					kobject_name(&mem_blk->dev.kobj));
>> @@ -804,32 +809,19 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
>>  }
>>  
>>  /*
>> - * Unregister memory block device under all nodes that it spans.
>> - * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
>> + * Unregister a memory block device under the node it spans. Memory blocks
>> + * with multiple nodes cannot be offlined and therefore also never be removed.
>>   */
>>  void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
>>  {
>> -	unsigned long pfn, sect_start_pfn, sect_end_pfn;
>> -	static nodemask_t unlinked_nodes;
>> -
>> -	nodes_clear(unlinked_nodes);
>> -	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
>> -	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
>> -	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
>> -		int nid;
>> +	if (mem_blk->nid == NUMA_NO_NODE ||
>> +	    WARN_ON_ONCE(mem_blk->nid == NUMA_NO_NODE - 1))
>> +		return;
>>  
>> -		nid = get_nid_for_pfn(pfn);
>> -		if (nid < 0)
>> -			continue;
>> -		if (!node_online(nid))
>> -			continue;
>> -		if (node_test_and_set(nid, unlinked_nodes))
>> -			continue;
>> -		sysfs_remove_link(&node_devices[nid]->dev.kobj,
>> -			 kobject_name(&mem_blk->dev.kobj));
>> -		sysfs_remove_link(&mem_blk->dev.kobj,
>> -			 kobject_name(&node_devices[nid]->dev.kobj));
>> -	}
>> +	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
>> +		 kobject_name(&mem_blk->dev.kobj));
>> +	sysfs_remove_link(&mem_blk->dev.kobj,
>> +		 kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
>>  }
>>  
>>  int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
>> diff --git a/include/linux/memory.h b/include/linux/memory.h
>> index 02e633f3ede0..c91af10d5fb4 100644
>> --- a/include/linux/memory.h
>> +++ b/include/linux/memory.h
>> @@ -33,7 +33,9 @@ struct memory_block {
>>  	void *hw;			/* optional pointer to fw/hw data */
>>  	int (*phys_callback)(struct memory_block *);
>>  	struct device dev;
>> -	int nid;			/* NID for this memory block */
>> +	int nid;			/* NID for this memory block.
>> +					   - NUMA_NO_NODE: uninitialized
>> +					   - NUMA_NO_NODE - 1: mixed nodes */
>>  };
>>  
>>  int arch_get_memory_phys_device(unsigned long start_pfn);
>> -- 
>> 2.21.0
>
David Hildenbrand July 19, 2019, 9:05 a.m. UTC | #4
On 19.07.19 10:42, Michal Hocko wrote:
> On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
>> We don't allow to offline memory block devices that belong to multiple
>> numa nodes. Therefore, such devices can never get removed. It is
>> sufficient to process a single node when removing the memory block.
>>
>> Remember for each memory block if it belongs to no, a single, or mixed
>> nodes, so we can use that information to skip unregistering or print a
>> warning (essentially a safety net to catch BUGs).
> 
> I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
> node that is magic. Why should we even care? In other words why is this
> patch an improvement?

Oh, and to answer that part of the question:

We no longer have to iterate over each pfn of a memory block to be removed.
Michal Hocko July 19, 2019, 9:09 a.m. UTC | #5
On Fri 19-07-19 10:48:19, David Hildenbrand wrote:
> On 19.07.19 10:42, Michal Hocko wrote:
> > On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
> >> We don't allow to offline memory block devices that belong to multiple
> >> numa nodes. Therefore, such devices can never get removed. It is
> >> sufficient to process a single node when removing the memory block.
> >>
> >> Remember for each memory block if it belongs to no, a single, or mixed
> >> nodes, so we can use that information to skip unregistering or print a
> >> warning (essentially a safety net to catch BUGs).
> > 
> > I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
> > node that is magic. Why should we even care? In other words why is this
> > patch an improvement?
> 
> I mean we can of course go ahead and drop the "NUMA_NO_NODE - 1" thingy
> from the patch. A memory block with multiple nodes would (as of now)
> only indicate one of the nodes.

Yes and that seemed to work reasonably well so far. Sure there is a
potential confusion but platforms with interleaved nodes are rare enough
to somebody to even notice so far.

> Then there is simply no way to WARN_ON_ONCE() in case unexpected things
> would happen. (I mean it really shouldn't happen or we have a BUG
> somewhere else)

I do not really see much point to warn here. What can user potentially
do?

> Alternative: Add "bool mixed_nids;" to "struct memory block".

That would be certainly possible but do we actually care?
Michal Hocko July 19, 2019, 9:13 a.m. UTC | #6
On Fri 19-07-19 11:05:51, David Hildenbrand wrote:
> On 19.07.19 10:42, Michal Hocko wrote:
> > On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
> >> We don't allow to offline memory block devices that belong to multiple
> >> numa nodes. Therefore, such devices can never get removed. It is
> >> sufficient to process a single node when removing the memory block.
> >>
> >> Remember for each memory block if it belongs to no, a single, or mixed
> >> nodes, so we can use that information to skip unregistering or print a
> >> warning (essentially a safety net to catch BUGs).
> > 
> > I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
> > node that is magic. Why should we even care? In other words why is this
> > patch an improvement?
> 
> Oh, and to answer that part of the question:
> 
> We no longer have to iterate over each pfn of a memory block to be removed.

Is it possible that we are overzealous when unregistering syfs files and
we should simply skip the pfn walk even without this change?
David Hildenbrand July 19, 2019, 9:18 a.m. UTC | #7
On 19.07.19 11:09, Michal Hocko wrote:
> On Fri 19-07-19 10:48:19, David Hildenbrand wrote:
>> On 19.07.19 10:42, Michal Hocko wrote:
>>> On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
>>>> We don't allow to offline memory block devices that belong to multiple
>>>> numa nodes. Therefore, such devices can never get removed. It is
>>>> sufficient to process a single node when removing the memory block.
>>>>
>>>> Remember for each memory block if it belongs to no, a single, or mixed
>>>> nodes, so we can use that information to skip unregistering or print a
>>>> warning (essentially a safety net to catch BUGs).
>>>
>>> I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
>>> node that is magic. Why should we even care? In other words why is this
>>> patch an improvement?
>>
>> I mean we can of course go ahead and drop the "NUMA_NO_NODE - 1" thingy
>> from the patch. A memory block with multiple nodes would (as of now)
>> only indicate one of the nodes.
> 
> Yes and that seemed to work reasonably well so far. Sure there is a
> potential confusion but platforms with interleaved nodes are rare enough
> to somebody to even notice so far.

Let's hope there are no BUGs related to that and we just didn't catch
them yet because it's barely used :)

> 
>> Then there is simply no way to WARN_ON_ONCE() in case unexpected things
>> would happen. (I mean it really shouldn't happen or we have a BUG
>> somewhere else)
> 
> I do not really see much point to warn here. What can user potentially
> do?

We could detect this while testing and see that some other code seems to
do unexpected things (remove such memory blocks although not allowed).

> 
>> Alternative: Add "bool mixed_nids;" to "struct memory block".
> 
> That would be certainly possible but do we actually care?

Only if we want to warn. And I am fine with dropping this part.
David Hildenbrand July 19, 2019, 9:20 a.m. UTC | #8
On 19.07.19 11:13, Michal Hocko wrote:
> On Fri 19-07-19 11:05:51, David Hildenbrand wrote:
>> On 19.07.19 10:42, Michal Hocko wrote:
>>> On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
>>>> We don't allow to offline memory block devices that belong to multiple
>>>> numa nodes. Therefore, such devices can never get removed. It is
>>>> sufficient to process a single node when removing the memory block.
>>>>
>>>> Remember for each memory block if it belongs to no, a single, or mixed
>>>> nodes, so we can use that information to skip unregistering or print a
>>>> warning (essentially a safety net to catch BUGs).
>>>
>>> I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
>>> node that is magic. Why should we even care? In other words why is this
>>> patch an improvement?
>>
>> Oh, and to answer that part of the question:
>>
>> We no longer have to iterate over each pfn of a memory block to be removed.
> 
> Is it possible that we are overzealous when unregistering syfs files and
> we should simply skip the pfn walk even without this change?
> 

I assume you mean something like v1 without the warning/"NUMA_NO_NODE -1"?

See what I have right now below.


From 27e9b02146e5fbe8edac49767693fa18c9b204dd Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 18 Jul 2019 15:48:41 +0200
Subject: [PATCH v2] drivers/base/node.c: Simplify
 unregister_memory_block_under_nodes()

We don't allow to offline memory block devices that belong to multiple
numa nodes. Therefore, such devices can never get removed. It is
sufficient to process a single node when removing the memory block.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/base/memory.c |  1 +
 drivers/base/node.c   | 39 +++++++++++++++------------------------
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 20c39d1bcef8..154d5d4a0779 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -674,6 +674,7 @@ static int init_memory_block(struct memory_block **memory,
 	mem->state = state;
 	start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	mem->phys_device = arch_get_memory_phys_device(start_pfn);
+	mem->nid = NUMA_NO_NODE;
 
 	ret = register_memory(mem);
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 75b7e6f6535b..840c95baa1d8 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -759,8 +759,6 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
 	int ret, nid = *(int *)arg;
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
 
-	mem_blk->nid = nid;
-
 	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
 	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
 	sect_end_pfn += PAGES_PER_SECTION - 1;
@@ -789,6 +787,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
 			if (page_nid != nid)
 				continue;
 		}
+
+		/*
+		 * If this memory block spans multiple nodes, we only indicate
+		 * the last processed node.
+		 */
+		mem_blk->nid = nid;
+
 		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
 					&mem_blk->dev.kobj,
 					kobject_name(&mem_blk->dev.kobj));
@@ -804,32 +809,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
 }
 
 /*
- * Unregister memory block device under all nodes that it spans.
- * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
+ * Unregister a memory block device under the node it spans. Memory blocks
+ * with multiple nodes cannot be offlined and therefore also never be removed.
  */
 void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	unsigned long pfn, sect_start_pfn, sect_end_pfn;
-	static nodemask_t unlinked_nodes;
-
-	nodes_clear(unlinked_nodes);
-	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
-	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
-	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
-		int nid;
+	if (mem_blk->nid == NUMA_NO_NODE)
+		return;
 
-		nid = get_nid_for_pfn(pfn);
-		if (nid < 0)
-			continue;
-		if (!node_online(nid))
-			continue;
-		if (node_test_and_set(nid, unlinked_nodes))
-			continue;
-		sysfs_remove_link(&node_devices[nid]->dev.kobj,
-			 kobject_name(&mem_blk->dev.kobj));
-		sysfs_remove_link(&mem_blk->dev.kobj,
-			 kobject_name(&node_devices[nid]->dev.kobj));
-	}
+	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
+			  kobject_name(&mem_blk->dev.kobj));
+	sysfs_remove_link(&mem_blk->dev.kobj,
+			  kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
 }
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
Michal Hocko July 19, 2019, 11:36 a.m. UTC | #9
On Fri 19-07-19 11:20:43, David Hildenbrand wrote:
> On 19.07.19 11:13, Michal Hocko wrote:
> > On Fri 19-07-19 11:05:51, David Hildenbrand wrote:
> >> On 19.07.19 10:42, Michal Hocko wrote:
> >>> On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
> >>>> We don't allow to offline memory block devices that belong to multiple
> >>>> numa nodes. Therefore, such devices can never get removed. It is
> >>>> sufficient to process a single node when removing the memory block.
> >>>>
> >>>> Remember for each memory block if it belongs to no, a single, or mixed
> >>>> nodes, so we can use that information to skip unregistering or print a
> >>>> warning (essentially a safety net to catch BUGs).
> >>>
> >>> I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
> >>> node that is magic. Why should we even care? In other words why is this
> >>> patch an improvement?
> >>
> >> Oh, and to answer that part of the question:
> >>
> >> We no longer have to iterate over each pfn of a memory block to be removed.
> > 
> > Is it possible that we are overzealous when unregistering syfs files and
> > we should simply skip the pfn walk even without this change?
> > 
> 
> I assume you mean something like v1 without the warning/"NUMA_NO_NODE -1"?
> 
> See what I have right now below.

Yes. I didn'g get to look closely but you caught the idea. Thanks!
David Hildenbrand July 19, 2019, 11:42 a.m. UTC | #10
On 19.07.19 13:36, Michal Hocko wrote:
> On Fri 19-07-19 11:20:43, David Hildenbrand wrote:
>> On 19.07.19 11:13, Michal Hocko wrote:
>>> On Fri 19-07-19 11:05:51, David Hildenbrand wrote:
>>>> On 19.07.19 10:42, Michal Hocko wrote:
>>>>> On Thu 18-07-19 16:22:39, David Hildenbrand wrote:
>>>>>> We don't allow to offline memory block devices that belong to multiple
>>>>>> numa nodes. Therefore, such devices can never get removed. It is
>>>>>> sufficient to process a single node when removing the memory block.
>>>>>>
>>>>>> Remember for each memory block if it belongs to no, a single, or mixed
>>>>>> nodes, so we can use that information to skip unregistering or print a
>>>>>> warning (essentially a safety net to catch BUGs).
>>>>>
>>>>> I do not really like NUMA_NO_NODE - 1 thing. This is yet another invalid
>>>>> node that is magic. Why should we even care? In other words why is this
>>>>> patch an improvement?
>>>>
>>>> Oh, and to answer that part of the question:
>>>>
>>>> We no longer have to iterate over each pfn of a memory block to be removed.
>>>
>>> Is it possible that we are overzealous when unregistering syfs files and
>>> we should simply skip the pfn walk even without this change?
>>>
>>
>> I assume you mean something like v1 without the warning/"NUMA_NO_NODE -1"?
>>
>> See what I have right now below.
> 
> Yes. I didn'g get to look closely but you caught the idea. Thanks!
> 

Will do a quick test and resent later this day, thanks for having a look!

Patch
diff mbox series

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 20c39d1bcef8..154d5d4a0779 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -674,6 +674,7 @@  static int init_memory_block(struct memory_block **memory,
 	mem->state = state;
 	start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	mem->phys_device = arch_get_memory_phys_device(start_pfn);
+	mem->nid = NUMA_NO_NODE;
 
 	ret = register_memory(mem);
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 75b7e6f6535b..29d27b8d5fda 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -759,8 +759,6 @@  static int register_mem_sect_under_node(struct memory_block *mem_blk,
 	int ret, nid = *(int *)arg;
 	unsigned long pfn, sect_start_pfn, sect_end_pfn;
 
-	mem_blk->nid = nid;
-
 	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
 	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
 	sect_end_pfn += PAGES_PER_SECTION - 1;
@@ -789,6 +787,13 @@  static int register_mem_sect_under_node(struct memory_block *mem_blk,
 			if (page_nid != nid)
 				continue;
 		}
+
+		/* this memory block spans this node */
+		if (mem_blk->nid == NUMA_NO_NODE)
+			mem_blk->nid = nid;
+		else
+			mem_blk->nid = NUMA_NO_NODE - 1;
+
 		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
 					&mem_blk->dev.kobj,
 					kobject_name(&mem_blk->dev.kobj));
@@ -804,32 +809,19 @@  static int register_mem_sect_under_node(struct memory_block *mem_blk,
 }
 
 /*
- * Unregister memory block device under all nodes that it spans.
- * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
+ * Unregister a memory block device under the node it spans. Memory blocks
+ * with multiple nodes cannot be offlined and therefore also never be removed.
  */
 void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	unsigned long pfn, sect_start_pfn, sect_end_pfn;
-	static nodemask_t unlinked_nodes;
-
-	nodes_clear(unlinked_nodes);
-	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
-	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
-	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
-		int nid;
+	if (mem_blk->nid == NUMA_NO_NODE ||
+	    WARN_ON_ONCE(mem_blk->nid == NUMA_NO_NODE - 1))
+		return;
 
-		nid = get_nid_for_pfn(pfn);
-		if (nid < 0)
-			continue;
-		if (!node_online(nid))
-			continue;
-		if (node_test_and_set(nid, unlinked_nodes))
-			continue;
-		sysfs_remove_link(&node_devices[nid]->dev.kobj,
-			 kobject_name(&mem_blk->dev.kobj));
-		sysfs_remove_link(&mem_blk->dev.kobj,
-			 kobject_name(&node_devices[nid]->dev.kobj));
-	}
+	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
+		 kobject_name(&mem_blk->dev.kobj));
+	sysfs_remove_link(&mem_blk->dev.kobj,
+		 kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
 }
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 02e633f3ede0..c91af10d5fb4 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -33,7 +33,9 @@  struct memory_block {
 	void *hw;			/* optional pointer to fw/hw data */
 	int (*phys_callback)(struct memory_block *);
 	struct device dev;
-	int nid;			/* NID for this memory block */
+	int nid;			/* NID for this memory block.
+					   - NUMA_NO_NODE: uninitialized
+					   - NUMA_NO_NODE - 1: mixed nodes */
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);