[05/17] arch, mm: pull out allocation of NODE_DATA to generic code

Message ID 20240716111346.3676969-6-rppt@kernel.org (mailing list archive)
State New, archived
Series mm: introduce numa_memblks

Commit Message

Mike Rapoport July 16, 2024, 11:13 a.m. UTC
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Architectures that support NUMA duplicate the code that allocates
NODE_DATA on node-local memory, with slight variations in reporting
of the addresses where the memory was allocated.

Use the x86 version as the basis for the generic alloc_node_data()
function and call this function in architecture-specific NUMA
initialization.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 arch/loongarch/kernel/numa.c | 18 ------------------
 arch/mips/loongson64/numa.c  | 16 ++--------------
 arch/powerpc/mm/numa.c       | 24 +++---------------------
 arch/sh/mm/init.c            |  7 +------
 arch/sparc/mm/init_64.c      |  9 ++-------
 arch/x86/mm/numa.c           | 34 +---------------------------------
 drivers/base/arch_numa.c     | 21 +--------------------
 include/linux/numa.h         |  2 ++
 mm/numa.c                    | 27 +++++++++++++++++++++++++++
 9 files changed, 39 insertions(+), 119 deletions(-)

Comments

David Hildenbrand July 17, 2024, 2:42 p.m. UTC | #1
On 16.07.24 13:13, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> Architectures that support NUMA duplicate the code that allocates
> NODE_DATA on node-local memory, with slight variations in reporting
> of the addresses where the memory was allocated.
> 
> Use the x86 version as the basis for the generic alloc_node_data()
> function and call this function in architecture-specific NUMA
> initialization.
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---

[...]

> diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
> index 9208eaadf690..909f6cec3a26 100644
> --- a/arch/mips/loongson64/numa.c
> +++ b/arch/mips/loongson64/numa.c
> @@ -81,12 +81,8 @@ static void __init init_topology_matrix(void)
>   
>   static void __init node_mem_init(unsigned int node)
>   {
> -	struct pglist_data *nd;
>   	unsigned long node_addrspace_offset;
>   	unsigned long start_pfn, end_pfn;
> -	unsigned long nd_pa;
> -	int tnid;
> -	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);

One interesting change is that we now always round up to full pages on 
architectures where we previously rounded up to SMP_CACHE_BYTES.

I assume we don't really expect a significant growth in memory 
consumption that we care about, especially because most systems with 
many nodes also have quite some memory around.


> -/* Allocate NODE_DATA for a node on the local memory */
> -static void __init alloc_node_data(int nid)
> -{
> -	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
> -	u64 nd_pa;
> -	void *nd;
> -	int tnid;
> -
> -	/*
> -	 * Allocate node data.  Try node-local memory and then any node.
> -	 * Never allocate in DMA zone.
> -	 */
> -	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> -	if (!nd_pa) {
> -		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> -		       nd_size, nid);
> -		return;
> -	}
> -	nd = __va(nd_pa);
> -
> -	/* report and initialize */
> -	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> -	       nd_pa, nd_pa + nd_size - 1);
> -	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> -	if (tnid != nid)
> -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
> -
> -	node_data[nid] = nd;
> -	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> -	node_set_online(nid);
> -}
> -
>   /**
>    * numa_cleanup_meminfo - Cleanup a numa_meminfo
>    * @mi: numa_meminfo to clean up
> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>   			continue;
>   
>   		alloc_node_data(nid);
> +		node_set_online(nid);
>   	}

I can spot that we only remove a single node_set_online() call from x86.

What about all the other architectures? Will there be any change in 
behavior for them? Or do we simply set the nodes online later once more?
Mike Rapoport July 18, 2024, 7:02 a.m. UTC | #2
On Wed, Jul 17, 2024 at 04:42:48PM +0200, David Hildenbrand wrote:
> On 16.07.24 13:13, Mike Rapoport wrote:
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> > 
> > Architectures that support NUMA duplicate the code that allocates
> > NODE_DATA on node-local memory, with slight variations in reporting
> > of the addresses where the memory was allocated.
> > 
> > Use the x86 version as the basis for the generic alloc_node_data()
> > function and call this function in architecture-specific NUMA
> > initialization.
> > 
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > ---
> 
> [...]
> 
> > diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
> > index 9208eaadf690..909f6cec3a26 100644
> > --- a/arch/mips/loongson64/numa.c
> > +++ b/arch/mips/loongson64/numa.c
> > @@ -81,12 +81,8 @@ static void __init init_topology_matrix(void)
> >   static void __init node_mem_init(unsigned int node)
> >   {
> > -	struct pglist_data *nd;
> >   	unsigned long node_addrspace_offset;
> >   	unsigned long start_pfn, end_pfn;
> > -	unsigned long nd_pa;
> > -	int tnid;
> > -	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
> 
> One interesting change is that we now always round up to full pages on
> architectures where we previously rounded up to SMP_CACHE_BYTES.

On my workstation struct pglist_data takes 174400 bytes (cachelines: 2725, members: 43).
 
> I assume we don't really expect a significant growth in memory consumption
> that we care about, especially because most systems with many nodes also
> have quite some memory around.

With the Debian kernel configuration for 6.5, struct pglist_data takes
174400 bytes, so the increase here is below 1%.

For NUMA systems with a lot of nodes, that shouldn't be a problem.
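
A quick back-of-the-envelope check of the two rounding policies (a
standalone userspace sketch, not kernel code; it assumes 4 KiB pages and
64-byte cachelines, which are typical for x86, plus the 174400-byte
pg_data_t above):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define SMP_CACHE_BYTES	64UL
#define PGDATA_SIZE	174400UL	/* sizeof(pg_data_t) on this config */

/* same semantics as the kernel's roundup() */
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned long per_page  = roundup(PGDATA_SIZE, PAGE_SIZE);
	unsigned long per_cache = roundup(PGDATA_SIZE, SMP_CACHE_BYTES);

	printf("PAGE_SIZE rounding:       %lu bytes\n", per_page);
	printf("SMP_CACHE_BYTES rounding: %lu bytes\n", per_cache);
	printf("overhead per node:        %lu bytes (%.2f%%)\n",
	       per_page - per_cache,
	       100.0 * (per_page - per_cache) / per_cache);
	return 0;
}

which prints 176128 vs 174400 bytes, i.e. 1728 bytes (~0.99%) of
overhead per node.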

> > -/* Allocate NODE_DATA for a node on the local memory */
> > -static void __init alloc_node_data(int nid)
> > -{
> > -	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
> > -	u64 nd_pa;
> > -	void *nd;
> > -	int tnid;
> > -
> > -	/*
> > -	 * Allocate node data.  Try node-local memory and then any node.
> > -	 * Never allocate in DMA zone.
> > -	 */
> > -	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> > -	if (!nd_pa) {
> > -		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> > -		       nd_size, nid);
> > -		return;
> > -	}
> > -	nd = __va(nd_pa);
> > -
> > -	/* report and initialize */
> > -	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> > -	       nd_pa, nd_pa + nd_size - 1);
> > -	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> > -	if (tnid != nid)
> > -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
> > -
> > -	node_data[nid] = nd;
> > -	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > -
> > -	node_set_online(nid);
> > -}
> > -
> >   /**
> >    * numa_cleanup_meminfo - Cleanup a numa_meminfo
> >    * @mi: numa_meminfo to clean up
> > @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> >   			continue;
> >   		alloc_node_data(nid);
> > +		node_set_online(nid);
> >   	}
> 
> I can spot that we only remove a single node_set_online() call from x86.
> 
> What about all the other architectures? Will there be any change in behavior
> for them? Or do we simply set the nodes online later once more?

On x86 node_set_online() was a part of alloc_node_data() and I moved it
outside so it's called right after alloc_node_data(). On other
architectures the allocation didn't include that call, so there should be
no difference there.
 
David Hildenbrand July 19, 2024, 3:07 p.m. UTC | #3
>>> -	 * Allocate node data.  Try node-local memory and then any node.
>>> -	 * Never allocate in DMA zone.
>>> -	 */
>>> -	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
>>> -	if (!nd_pa) {
>>> -		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
>>> -		       nd_size, nid);
>>> -		return;
>>> -	}
>>> -	nd = __va(nd_pa);
>>> -
>>> -	/* report and initialize */
>>> -	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
>>> -	       nd_pa, nd_pa + nd_size - 1);
>>> -	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
>>> -	if (tnid != nid)
>>> -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
>>> -
>>> -	node_data[nid] = nd;
>>> -	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
>>> -
>>> -	node_set_online(nid);
>>> -}
>>> -
>>>    /**
>>>     * numa_cleanup_meminfo - Cleanup a numa_meminfo
>>>     * @mi: numa_meminfo to clean up
>>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>>>    			continue;
>>>    		alloc_node_data(nid);
>>> +		node_set_online(nid);
>>>    	}
>>
>> I can spot that we only remove a single node_set_online() call from x86.
>>
>> What about all the other architectures? Will there be any change in behavior
>> for them? Or do we simply set the nodes online later once more?
> 
> On x86 node_set_online() was a part of alloc_node_data() and I moved it
> outside so it's called right after alloc_node_data(). On other
> architectures the allocation didn't include that call, so there should be
> no difference there.

But won't their arch code try setting the nodes online at a later stage?

And I think some architectures only set nodes online conditionally
(see most other node_set_online() calls).

Sorry if I'm confused here, but with the now-unconditional
node_set_online(), won't we change the behavior of other architectures?
Mike Rapoport July 19, 2024, 3:34 p.m. UTC | #4
On Fri, Jul 19, 2024 at 05:07:35PM +0200, David Hildenbrand wrote:
> > > > -	 * Allocate node data.  Try node-local memory and then any node.
> > > > -	 * Never allocate in DMA zone.
> > > > -	 */
> > > > -	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> > > > -	if (!nd_pa) {
> > > > -		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> > > > -		       nd_size, nid);
> > > > -		return;
> > > > -	}
> > > > -	nd = __va(nd_pa);
> > > > -
> > > > -	/* report and initialize */
> > > > -	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> > > > -	       nd_pa, nd_pa + nd_size - 1);
> > > > -	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> > > > -	if (tnid != nid)
> > > > -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
> > > > -
> > > > -	node_data[nid] = nd;
> > > > -	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> > > > -
> > > > -	node_set_online(nid);
> > > > -}
> > > > -
> > > >    /**
> > > >     * numa_cleanup_meminfo - Cleanup a numa_meminfo
> > > >     * @mi: numa_meminfo to clean up
> > > > @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > > >    			continue;
> > > >    		alloc_node_data(nid);
> > > > +		node_set_online(nid);
> > > >    	}
> > > 
> > > I can spot that we only remove a single node_set_online() call from x86.
> > > 
> > > What about all the other architectures? Will there be any change in behavior
> > > for them? Or do we simply set the nodes online later once more?
> > 
> > On x86 node_set_online() was a part of alloc_node_data() and I moved it
> > outside so it's called right after alloc_node_data(). On other
> > architectures the allocation didn't include that call, so there should be
> > no difference there.
> 
> But won't their arch code try setting the nodes online at a later stage?
> 
> And I think some architectures only set nodes online conditionally
> (see most other node_set_online() calls).
>
> Sorry if I'm confused here, but with the now-unconditional
> node_set_online(), won't we change the behavior of other architectures?

The generic alloc_node_data() does not set the node online:

+/* Allocate NODE_DATA for a node on the local memory */
+void __init alloc_node_data(int nid)
+{
+	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+	u64 nd_pa;
+	void *nd;
+	int tnid;
+
+	/* Allocate node data.  Try node-local memory and then any node. */
+	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa)
+		panic("Cannot allocate %zu bytes for node %d data\n",
+		      nd_size, nid);
+	nd = __va(nd_pa);
+
+	/* report and initialize */
+	pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+		nd_pa, nd_pa + nd_size - 1);
+	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+	if (tnid != nid)
+		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);
+
+	node_data[nid] = nd;
+	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+}

I might have missed some architecture other than x86 that calls
node_set_online() in its alloc_node_data(), but the intention was to leave
that call outside the alloc and explicitly add it after the call to
alloc_node_data() where needed, as on x86.

David Hildenbrand July 19, 2024, 3:46 p.m. UTC | #5
On 19.07.24 17:34, Mike Rapoport wrote:
> On Fri, Jul 19, 2024 at 05:07:35PM +0200, David Hildenbrand wrote:
>>>>> -	 * Allocate node data.  Try node-local memory and then any node.
>>>>> -	 * Never allocate in DMA zone.
>>>>> -	 */
>>>>> -	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
>>>>> -	if (!nd_pa) {
>>>>> -		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
>>>>> -		       nd_size, nid);
>>>>> -		return;
>>>>> -	}
>>>>> -	nd = __va(nd_pa);
>>>>> -
>>>>> -	/* report and initialize */
>>>>> -	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
>>>>> -	       nd_pa, nd_pa + nd_size - 1);
>>>>> -	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
>>>>> -	if (tnid != nid)
>>>>> -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
>>>>> -
>>>>> -	node_data[nid] = nd;
>>>>> -	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
>>>>> -
>>>>> -	node_set_online(nid);
>>>>> -}
>>>>> -
>>>>>     /**
>>>>>      * numa_cleanup_meminfo - Cleanup a numa_meminfo
>>>>>      * @mi: numa_meminfo to clean up
>>>>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>>>>>     			continue;
>>>>>     		alloc_node_data(nid);
>>>>> +		node_set_online(nid);
>>>>>     	}
>>>>
>>>> I can spot that we only remove a single node_set_online() call from x86.
>>>>
>>>> What about all the other architectures? Will there be any change in behavior
>>>> for them? Or do we simply set the nodes online later once more?
>>>
>>> On x86 node_set_online() was a part of alloc_node_data() and I moved it
>>> outside so it's called right after alloc_node_data(). On other
>>> architectures the allocation didn't include that call, so there should be
>>> no difference there.
>>
>> But won't their arch code try setting the nodes online at a later stage?
>>
>> And I think some architectures only set nodes online conditionally
>> (see most other node_set_online() calls).
>>
>> Sorry if I'm confused here, but with the now-unconditional
>> node_set_online(), won't we change the behavior of other architectures?
> 
> The generic alloc_node_data() does not set the node online:
> 
> +/* Allocate NODE_DATA for a node on the local memory */
> +void __init alloc_node_data(int nid)
> +{
> +	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
> +	u64 nd_pa;
> +	void *nd;
> +	int tnid;
> +
> +	/* Allocate node data.  Try node-local memory and then any node. */
> +	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> +	if (!nd_pa)
> +		panic("Cannot allocate %zu bytes for node %d data\n",
> +		      nd_size, nid);
> +	nd = __va(nd_pa);
> +
> +	/* report and initialize */
> +	pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> +		nd_pa, nd_pa + nd_size - 1);
> +	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> +	if (tnid != nid)
> +		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);
> +
> +	node_data[nid] = nd;
> +	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> +}
> 
> I might have missed some architecture other than x86 that calls
> node_set_online() in its alloc_node_data(), but the intention was to leave
> that call outside the alloc and explicitly add it after the call to
> alloc_node_data() where needed, as on x86.

I'm stupid, I didn't realize it is still only called from x86 :(

Acked-by: David Hildenbrand <david@redhat.com>
Jonathan Cameron July 19, 2024, 3:51 p.m. UTC | #6
On Fri, 19 Jul 2024 17:07:35 +0200
David Hildenbrand <david@redhat.com> wrote:

> >>> -	 * Allocate node data.  Try node-local memory and then any node.
> >>> -	 * Never allocate in DMA zone.
> >>> -	 */
> >>> -	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
> >>> -	if (!nd_pa) {
> >>> -		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
> >>> -		       nd_size, nid);
> >>> -		return;
> >>> -	}
> >>> -	nd = __va(nd_pa);
> >>> -
> >>> -	/* report and initialize */
> >>> -	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
> >>> -	       nd_pa, nd_pa + nd_size - 1);
> >>> -	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
> >>> -	if (tnid != nid)
> >>> -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
> >>> -
> >>> -	node_data[nid] = nd;
> >>> -	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> >>> -
> >>> -	node_set_online(nid);
> >>> -}
> >>> -
> >>>    /**
> >>>     * numa_cleanup_meminfo - Cleanup a numa_meminfo
> >>>     * @mi: numa_meminfo to clean up
> >>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> >>>    			continue;
> >>>    		alloc_node_data(nid);
> >>> +		node_set_online(nid);
> >>>    	}  
> >>
> >> I can spot that we only remove a single node_set_online() call from x86.
> >>
> >> What about all the other architectures? Will there be any change in behavior
> >> for them? Or do we simply set the nodes online later once more?  
> > 
> > On x86 node_set_online() was a part of alloc_node_data() and I moved it
> > outside so it's called right after alloc_node_data(). On other
> > architectures the allocation didn't include that call, so there should be
> > no difference there.  
> 
> But won't their arch code try setting the nodes online at a later stage?
> 
> And I think some architectures only set nodes online conditionally
> (see most other node_set_online() calls).
>
> Sorry if I'm confused here, but with the now-unconditional
> node_set_online(), won't we change the behavior of other architectures?
This is moving x86 code to x86 code, not to a generic location, so how
would that affect anyone else? Their onlining should be the same as
before.

The node onlining differences are a pain (I recall that fun from adding
generic initiators), as the ordering differs between x86 and arm64 at least.

Jonathan

David Hildenbrand July 19, 2024, 4:07 p.m. UTC | #7
On 19.07.24 17:51, Jonathan Cameron wrote:
> On Fri, 19 Jul 2024 17:07:35 +0200
> David Hildenbrand <david@redhat.com> wrote:
> 
>>>>> -	 * Allocate node data.  Try node-local memory and then any node.
>>>>> -	 * Never allocate in DMA zone.
>>>>> -	 */
>>>>> -	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
>>>>> -	if (!nd_pa) {
>>>>> -		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
>>>>> -		       nd_size, nid);
>>>>> -		return;
>>>>> -	}
>>>>> -	nd = __va(nd_pa);
>>>>> -
>>>>> -	/* report and initialize */
>>>>> -	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
>>>>> -	       nd_pa, nd_pa + nd_size - 1);
>>>>> -	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
>>>>> -	if (tnid != nid)
>>>>> -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
>>>>> -
>>>>> -	node_data[nid] = nd;
>>>>> -	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
>>>>> -
>>>>> -	node_set_online(nid);
>>>>> -}
>>>>> -
>>>>>     /**
>>>>>      * numa_cleanup_meminfo - Cleanup a numa_meminfo
>>>>>      * @mi: numa_meminfo to clean up
>>>>> @@ -571,6 +538,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>>>>>     			continue;
>>>>>     		alloc_node_data(nid);
>>>>> +		node_set_online(nid);
>>>>>     	}
>>>>
>>>> I can spot that we only remove a single node_set_online() call from x86.
>>>>
>>>> What about all the other architectures? Will there be any change in behavior
>>>> for them? Or do we simply set the nodes online later once more?
>>>
>>> On x86 node_set_online() was a part of alloc_node_data() and I moved it
>>> outside so it's called right after alloc_node_data(). On other
>>> architectures the allocation didn't include that call, so there should be
>>> no difference there.
>>
>> But won't their arch code try setting the nodes online at a later stage?
>>
>> And I think some architectures only set nodes online conditionally
>> (see most other node_set_online() calls).
>>
>> Sorry if I'm confused here, but with the now-unconditional
>> node_set_online(), won't we change the behavior of other architectures?
> This is moving x86 code to x86 code, not to a generic location, so how
> would that affect anyone else? Their onlining should be the same as
> before.

Yes, see my reply to Mike.

> 
> The node onlining differences are a pain (I recall that fun from adding
> generic initiators), as the ordering differs between x86 and arm64 at least.

That's part of the reason I was confused, because I remember some nasty 
inconsistency.
Jonathan Cameron July 19, 2024, 4:11 p.m. UTC | #8
On Tue, 16 Jul 2024 14:13:34 +0300
Mike Rapoport <rppt@kernel.org> wrote:

> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> Architectures that support NUMA duplicate the code that allocates
> NODE_DATA on node-local memory, with slight variations in reporting
> of the addresses where the memory was allocated.
> 
> Use the x86 version as the basis for the generic alloc_node_data()
> function and call this function in architecture-specific NUMA
> initialization.
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>


I've no idea what the rules are for the sparc prom_printf() calls, but
given that file already mixes and matches those and normal prints within
single functions, I assume this change is fine and we'll just see the
prints a bit later.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Mike Rapoport July 20, 2024, 10:24 a.m. UTC | #9
On Wed, Jul 17, 2024 at 04:42:48PM +0200, David Hildenbrand wrote:
> On 16.07.24 13:13, Mike Rapoport wrote:
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> > 
> > Architectures that support NUMA duplicate the code that allocates
> > NODE_DATA on node-local memory, with slight variations in reporting
> > of the addresses where the memory was allocated.
> > 
> > Use the x86 version as the basis for the generic alloc_node_data()
> > function and call this function in architecture-specific NUMA
> > initialization.
> > 
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > ---
> 
> [...]
> 
> > diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
> > index 9208eaadf690..909f6cec3a26 100644
> > --- a/arch/mips/loongson64/numa.c
> > +++ b/arch/mips/loongson64/numa.c
> > @@ -81,12 +81,8 @@ static void __init init_topology_matrix(void)
> >   static void __init node_mem_init(unsigned int node)
> >   {
> > -	struct pglist_data *nd;
> >   	unsigned long node_addrspace_offset;
> >   	unsigned long start_pfn, end_pfn;
> > -	unsigned long nd_pa;
> > -	int tnid;
> > -	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
> 
> One interesting change is that we now always round up to full pages on
> architectures where we previously rounded up to SMP_CACHE_BYTES.

I did some git archaeology and it seems that rounding up to full pages on
x86 dates back to the bootmem era, when allocation granularity was
PAGE_SIZE anyway. I'm going to change that to SMP_CACHE_BYTES in v2.
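
Presumably a one-liner in the generic alloc_node_data(), something like
(a sketch of the expected change, not the actual v2 hunk):

-	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);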
 
> I assume we don't really expect a significant growth in memory consumption
> that we care about, especially because most systems with many nodes also
> have quite some memory around.

Patch

diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index acada671e020..84fe7f854820 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -187,24 +187,6 @@  int __init numa_add_memblk(int nid, u64 start, u64 end)
 	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
 }
 
-static void __init alloc_node_data(int nid)
-{
-	void *nd;
-	unsigned long nd_pa;
-	size_t nd_sz = roundup(sizeof(pg_data_t), PAGE_SIZE);
-
-	nd_pa = memblock_phys_alloc_try_nid(nd_sz, SMP_CACHE_BYTES, nid);
-	if (!nd_pa) {
-		pr_err("Cannot find %zu Byte for node_data (initial node: %d)\n", nd_sz, nid);
-		return;
-	}
-
-	nd = __va(nd_pa);
-
-	node_data[nid] = nd;
-	memset(nd, 0, sizeof(pg_data_t));
-}
-
 static void __init node_mem_init(unsigned int node)
 {
 	unsigned long start_pfn, end_pfn;
diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
index 9208eaadf690..909f6cec3a26 100644
--- a/arch/mips/loongson64/numa.c
+++ b/arch/mips/loongson64/numa.c
@@ -81,12 +81,8 @@  static void __init init_topology_matrix(void)
 
 static void __init node_mem_init(unsigned int node)
 {
-	struct pglist_data *nd;
 	unsigned long node_addrspace_offset;
 	unsigned long start_pfn, end_pfn;
-	unsigned long nd_pa;
-	int tnid;
-	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
 
 	node_addrspace_offset = nid_to_addrbase(node);
 	pr_info("Node%d's addrspace_offset is 0x%lx\n",
@@ -96,16 +92,8 @@  static void __init node_mem_init(unsigned int node)
 	pr_info("Node%d: start_pfn=0x%lx, end_pfn=0x%lx\n",
 		node, start_pfn, end_pfn);
 
-	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, node);
-	if (!nd_pa)
-		panic("Cannot allocate %zu bytes for node %d data\n",
-		      nd_size, node);
-	nd = __va(nd_pa);
-	memset(nd, 0, sizeof(struct pglist_data));
-	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-	if (tnid != node)
-		pr_info("NODE_DATA(%d) on node %d\n", node, tnid);
-	node_data[node] = nd;
+	alloc_node_data(node);
+
 	NODE_DATA(node)->node_start_pfn = start_pfn;
 	NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn;
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8c18973cd71e..4c54764af160 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1081,27 +1081,9 @@  void __init dump_numa_cpu_topology(void)
 static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
 {
 	u64 spanned_pages = end_pfn - start_pfn;
-	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
-	u64 nd_pa;
-	void *nd;
-	int tnid;
-
-	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
-	if (!nd_pa)
-		panic("Cannot allocate %zu bytes for node %d data\n",
-		      nd_size, nid);
-
-	nd = __va(nd_pa);
-
-	/* report and initialize */
-	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
-		nd_pa, nd_pa + nd_size - 1);
-	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-	if (tnid != nid)
-		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);
-
-	node_data[nid] = nd;
-	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+
+	alloc_node_data(nid);
+
 	NODE_DATA(nid)->node_id = nid;
 	NODE_DATA(nid)->node_start_pfn = start_pfn;
 	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index bf1b54055316..5cc89a0932c3 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -212,12 +212,7 @@  void __init allocate_pgdat(unsigned int nid)
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 
 #ifdef CONFIG_NUMA
-	NODE_DATA(nid) = memblock_alloc_try_nid(
-				sizeof(struct pglist_data),
-				SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
-				MEMBLOCK_ALLOC_ACCESSIBLE, nid);
-	if (!NODE_DATA(nid))
-		panic("Can't allocate pgdat for node %d\n", nid);
+	alloc_node_data(nid);
 #endif
 
 	NODE_DATA(nid)->node_start_pfn = start_pfn;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3cb698204609..83279c43572d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1075,14 +1075,9 @@  static void __init allocate_node_data(int nid)
 {
 	struct pglist_data *p;
 	unsigned long start_pfn, end_pfn;
-#ifdef CONFIG_NUMA
 
-	NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
-					     SMP_CACHE_BYTES, nid);
-	if (!NODE_DATA(nid)) {
-		prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
-		prom_halt();
-	}
+#ifdef CONFIG_NUMA
+	alloc_node_data(nid);
 
 	NODE_DATA(nid)->node_id = nid;
 #endif
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 7de725d6bb05..5e1dde26674b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -191,39 +191,6 @@  int __init numa_add_memblk(int nid, u64 start, u64 end)
 	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
 }
 
-/* Allocate NODE_DATA for a node on the local memory */
-static void __init alloc_node_data(int nid)
-{
-	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
-	u64 nd_pa;
-	void *nd;
-	int tnid;
-
-	/*
-	 * Allocate node data.  Try node-local memory and then any node.
-	 * Never allocate in DMA zone.
-	 */
-	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
-	if (!nd_pa) {
-		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
-		       nd_size, nid);
-		return;
-	}
-	nd = __va(nd_pa);
-
-	/* report and initialize */
-	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
-	       nd_pa, nd_pa + nd_size - 1);
-	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-	if (tnid != nid)
-		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
-
-	node_data[nid] = nd;
-	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
-	node_set_online(nid);
-}
-
 /**
  * numa_cleanup_meminfo - Cleanup a numa_meminfo
  * @mi: numa_meminfo to clean up
@@ -571,6 +538,7 @@  static int __init numa_register_memblks(struct numa_meminfo *mi)
 			continue;
 
 		alloc_node_data(nid);
+		node_set_online(nid);
 	}
 
 	/* Dump memblock with node info and return. */
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 9b71ad2869f1..2ebf12eab99f 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -216,30 +216,11 @@  int __init numa_add_memblk(int nid, u64 start, u64 end)
  */
 static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
 {
-	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
-	u64 nd_pa;
-	void *nd;
-	int tnid;
-
 	if (start_pfn >= end_pfn)
 		pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
 
-	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
-	if (!nd_pa)
-		panic("Cannot allocate %zu bytes for node %d data\n",
-		      nd_size, nid);
-
-	nd = __va(nd_pa);
-
-	/* report and initialize */
-	pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
-		nd_pa, nd_pa + nd_size - 1);
-	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-	if (tnid != nid)
-		pr_info("NODE_DATA(%d) on node %d\n", nid, tnid);
+	alloc_node_data(nid);
 
-	node_data[nid] = nd;
-	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
 	NODE_DATA(nid)->node_id = nid;
 	NODE_DATA(nid)->node_start_pfn = start_pfn;
 	NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
diff --git a/include/linux/numa.h b/include/linux/numa.h
index e5841d4057ab..3b12d8ca0afd 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -33,6 +33,8 @@  static inline bool numa_valid_node(int nid)
 extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)	(node_data[nid])
 
+void __init alloc_node_data(int nid);
+
 /* Generic implementation available */
 int numa_nearest_node(int node, unsigned int state);
 
diff --git a/mm/numa.c b/mm/numa.c
index 8c157d41c026..0483cabc4c4b 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -1,11 +1,38 @@ 
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <linux/memblock.h>
 #include <linux/printk.h>
 #include <linux/numa.h>
 
 struct pglist_data *node_data[MAX_NUMNODES];
 EXPORT_SYMBOL(node_data);
 
+/* Allocate NODE_DATA for a node on the local memory */
+void __init alloc_node_data(int nid)
+{
+	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+	u64 nd_pa;
+	void *nd;
+	int tnid;
+
+	/* Allocate node data.  Try node-local memory and then any node. */
+	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa)
+		panic("Cannot allocate %zu bytes for node %d data\n",
+		      nd_size, nid);
+	nd = __va(nd_pa);
+
+	/* report and initialize */
+	pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+		nd_pa, nd_pa + nd_size - 1);
+	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+	if (tnid != nid)
+		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);
+
+	node_data[nid] = nd;
+	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+}
+
 /* Stub functions: */
 
 #ifndef memory_add_physaddr_to_nid