Message ID | 20240801060826.559858-8-rppt@kernel.org (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | mm: introduce numa_memblks | expand |
On Thu, 1 Aug 2024 09:08:07 +0300 Mike Rapoport <rppt@kernel.org> wrote: > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org> > > There are no users of HAVE_ARCH_NODEDATA_EXTENSION left, so > arch_alloc_nodedata() and arch_refresh_nodedata() are not needed > anymore. > > Replace the call to arch_alloc_nodedata() in free_area_init() with > memblock_alloc(), remove arch_refresh_nodedata() and cleanup > include/linux/memory_hotplug.h from the associated ifdefery. > > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> > Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64 Hi Mike, This has an accidental (I assume) functional change and if you have an initially offline node it all goes wrong. > --- > include/linux/memory_hotplug.h | 48 ---------------------------------- > mm/mm_init.c | 3 +-- > 2 files changed, 1 insertion(+), 50 deletions(-) > > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index ebe876930e78..b27ddce5d324 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -16,54 +16,6 @@ struct resource; > struct vmem_altmap; > struct dev_pagemap; > > -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION > -/* > - * For supporting node-hotadd, we have to allocate a new pgdat. > - * > - * If an arch has generic style NODE_DATA(), > - * node_data[nid] = kzalloc() works well. But it depends on the architecture. > - * > - * In general, generic_alloc_nodedata() is used. > - * > - */ > -extern pg_data_t *arch_alloc_nodedata(int nid); > -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); > - > -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ > - > -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) > - > -#ifdef CONFIG_NUMA > -/* > - * XXX: node aware allocation can't work well to get new node's memory at this time. > - * Because, pgdat for the new node is not allocated/initialized yet itself. > - * To use new node's memory, more consideration will be necessary. > - */ > -#define generic_alloc_nodedata(nid) \ > -({ \ > - memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ > -}) > - > -extern pg_data_t *node_data[]; > -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) > -{ > - node_data[nid] = pgdat; > -} > - > -#else /* !CONFIG_NUMA */ > - > -/* never called */ > -static inline pg_data_t *generic_alloc_nodedata(int nid) > -{ > - BUG(); > - return NULL; > -} > -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) > -{ > -} > -#endif /* CONFIG_NUMA */ > -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ > - > #ifdef CONFIG_MEMORY_HOTPLUG > struct page *pfn_to_online_page(unsigned long pfn); > > diff --git a/mm/mm_init.c b/mm/mm_init.c > index 75c3bd42799b..bcc2f2dd8021 100644 > --- a/mm/mm_init.c > +++ b/mm/mm_init.c > @@ -1838,11 +1838,10 @@ void __init free_area_init(unsigned long *max_zone_pfn) > > if (!node_online(nid)) { > /* Allocator not initialized yet */ > - pgdat = arch_alloc_nodedata(nid); > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > if (!pgdat) > panic("Cannot allocate %zuB for node %d.\n", > sizeof(*pgdat), nid); > - arch_refresh_nodedata(nid, pgdat); This allocates pgdat but never sets node_data[nid] to it and promptly leaks it on the line below. Just to sanity check this I spun up a qemu machine with no memory initially present on some nodes and it went boom as you'd expect. I tested with addition of NODE_DATA(nid) = pgdat; and it all seems to work as expected. Jonathan > } > > pgdat = NODE_DATA(nid);
On Fri, 2 Aug 2024 10:49:22 +0100 Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote: > > --- a/mm/mm_init.c > > +++ b/mm/mm_init.c > > @@ -1838,11 +1838,10 @@ void __init free_area_init(unsigned long *max_zone_pfn) > > > > if (!node_online(nid)) { > > /* Allocator not initialized yet */ > > - pgdat = arch_alloc_nodedata(nid); > > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > > if (!pgdat) > > panic("Cannot allocate %zuB for node %d.\n", > > sizeof(*pgdat), nid); > > - arch_refresh_nodedata(nid, pgdat); > > This allocates pgdat but never sets node_data[nid] to it > and promptly leaks it on the line below. > > Just to sanity check this I spun up a qemu machine with no memory > initially present on some nodes and it went boom as you'd expect. > > I tested with addition of > NODE_DATA(nid) = pgdat; > and it all seems to work as expected. Thanks, I added that. It blew up on x86_64 allnoconfig because node_data[] (and hence NODE_DATA()) isn't an lvalue when CONFIG_NUMA=n. I'll put some #ifdef CONFIG_NUMAs in there for now but a) NODE_DATA() is upper-case. Implies "constant". Shouldn't be assigned to. b) NODE_DATA() should be non-lvalue when CONFIG_NUMA=y also. But no, we insist on implementing things in cpp instead of in C. c) In fact assigning to anything which ends in "()" is nuts. Please clean up my tempfix. c) Mike, generally I'm wondering if there's a bunch of code here which isn't needed on CONFIG_NUMA=n. Please check all of this for unneeded bloatiness.
On Sat, Aug 03, 2024 at 11:58:13AM -0700, Andrew Morton wrote: > On Fri, 2 Aug 2024 10:49:22 +0100 Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote: > > > > --- a/mm/mm_init.c > > > +++ b/mm/mm_init.c > > > @@ -1838,11 +1838,10 @@ void __init free_area_init(unsigned long *max_zone_pfn) > > > > > > if (!node_online(nid)) { > > > /* Allocator not initialized yet */ > > > - pgdat = arch_alloc_nodedata(nid); > > > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > > > if (!pgdat) > > > panic("Cannot allocate %zuB for node %d.\n", > > > sizeof(*pgdat), nid); > > > - arch_refresh_nodedata(nid, pgdat); > > > > This allocates pgdat but never sets node_data[nid] to it > > and promptly leaks it on the line below. > > > > Just to sanity check this I spun up a qemu machine with no memory > > initially present on some nodes and it went boom as you'd expect. > > > > I tested with addition of > > NODE_DATA(nid) = pgdat; > > and it all seems to work as expected. > > Thanks, I added that. It blew up on x86_64 allnoconfig because > node_data[] (and hence NODE_DATA()) isn't an lvalue when CONFIG_NUMA=n. > > I'll put some #ifdef CONFIG_NUMAs in there for now but > > a) NODE_DATA() is upper-case. Implies "constant". Shouldn't be assigned to. > > b) NODE_DATA() should be non-lvalue when CONFIG_NUMA=y also. But no, > we insist on implementing things in cpp instead of in C. This looks like a candidate for a separate tree-wide cleanup. > c) In fact assigning to anything which ends in "()" is nuts. Please > clean up my tempfix. > > c) Mike, generally I'm wondering if there's a bunch of code here > which isn't needed on CONFIG_NUMA=n. Please check all of this for > unneeded bloatiness. I believe the patch addresses your concerns, just with this the commit log needs update. Instead of Replace the call to arch_alloc_nodedata() in free_area_init() with memblock_alloc(), remove arch_refresh_nodedata() and cleanup include/linux/memory_hotplug.h from the associated ifdefery. it should be Replace the call to arch_alloc_nodedata() in free_area_init() with a new helper alloc_offline_node_data(), remove arch_refresh_nodedata() and cleanup include/linux/memory_hotplug.h from the associated ifdefery. I can send an updated patch if you prefer. diff --git a/include/linux/numa.h b/include/linux/numa.h index 3b12d8ca0afd..5a749fd67f39 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -34,6 +34,7 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) void __init alloc_node_data(int nid); +void __init alloc_offline_node_data(int nit); /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); @@ -62,6 +63,8 @@ static inline int phys_to_target_node(u64 start) { return 0; } + +static inline void alloc_offline_node_data(int nit) {} #endif #define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) diff --git a/mm/mm_init.c b/mm/mm_init.c index bcc2f2dd8021..2785be04e7bb 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1836,13 +1836,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) for_each_node(nid) { pg_data_t *pgdat; - if (!node_online(nid)) { - /* Allocator not initialized yet */ - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - } + if (!node_online(nid)) + alloc_offline_node_data(nid); pgdat = NODE_DATA(nid); free_area_init_node(nid); diff --git a/mm/numa.c b/mm/numa.c index da27eb151dc5..07e486a977c7 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -34,6 +34,18 @@ void __init alloc_node_data(int nid) memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); } +void __init alloc_offline_node_data(int nit) +{ + pg_data_t *pgdat; + + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); + if (!pgdat) + panic("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + + node_data[nid] = pgdat; +} + /* Stub functions: */ #ifndef memory_add_physaddr_to_nid
On Sun, 4 Aug 2024 10:24:15 +0300 Mike Rapoport <rppt@kernel.org> wrote: > On Sat, Aug 03, 2024 at 11:58:13AM -0700, Andrew Morton wrote: > > On Fri, 2 Aug 2024 10:49:22 +0100 Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote: > > > > > > --- a/mm/mm_init.c > > > > +++ b/mm/mm_init.c > > > > @@ -1838,11 +1838,10 @@ void __init free_area_init(unsigned long *max_zone_pfn) > > > > > > > > if (!node_online(nid)) { > > > > /* Allocator not initialized yet */ > > > > - pgdat = arch_alloc_nodedata(nid); > > > > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > > > > if (!pgdat) > > > > panic("Cannot allocate %zuB for node %d.\n", > > > > sizeof(*pgdat), nid); > > > > - arch_refresh_nodedata(nid, pgdat); > > > > > > This allocates pgdat but never sets node_data[nid] to it > > > and promptly leaks it on the line below. > > > > > > Just to sanity check this I spun up a qemu machine with no memory > > > initially present on some nodes and it went boom as you'd expect. > > > > > > I tested with addition of > > > NODE_DATA(nid) = pgdat; > > > and it all seems to work as expected. > > > > Thanks, I added that. It blew up on x86_64 allnoconfig because > > node_data[] (and hence NODE_DATA()) isn't an lvalue when CONFIG_NUMA=n. > > > > I'll put some #ifdef CONFIG_NUMAs in there for now but > > > > a) NODE_DATA() is upper-case. Implies "constant". Shouldn't be assigned to. > > > > b) NODE_DATA() should be non-lvalue when CONFIG_NUMA=y also. But no, > > we insist on implementing things in cpp instead of in C. > > This looks like a candidate for a separate tree-wide cleanup. > > > c) In fact assigning to anything which ends in "()" is nuts. Please > > clean up my tempfix. > > > > c) Mike, generally I'm wondering if there's a bunch of code here > > which isn't needed on CONFIG_NUMA=n. Please check all of this for > > unneeded bloatiness. > > I believe the patch addresses your concerns, just with this the commit log > needs update. Instead of > > Replace the call to arch_alloc_nodedata() in free_area_init() with > memblock_alloc(), remove arch_refresh_nodedata() and cleanup > include/linux/memory_hotplug.h from the associated ifdefery. > > it should be > > Replace the call to arch_alloc_nodedata() in free_area_init() with a > new helper alloc_offline_node_data(), remove arch_refresh_nodedata() > and cleanup include/linux/memory_hotplug.h from the associated > ifdefery. > > I can send an updated patch if you prefer. This solution looks good to me - except for a Freudian typo that means it won't compile :) Jonathan > > diff --git a/include/linux/numa.h b/include/linux/numa.h > index 3b12d8ca0afd..5a749fd67f39 100644 > --- a/include/linux/numa.h > +++ b/include/linux/numa.h > @@ -34,6 +34,7 @@ extern struct pglist_data *node_data[]; > #define NODE_DATA(nid) (node_data[nid]) > > void __init alloc_node_data(int nid); > +void __init alloc_offline_node_data(int nit); > > /* Generic implementation available */ > int numa_nearest_node(int node, unsigned int state); > @@ -62,6 +63,8 @@ static inline int phys_to_target_node(u64 start) > { > return 0; > } > + > +static inline void alloc_offline_node_data(int nit) {} nid > #endif > > #define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) > diff --git a/mm/mm_init.c b/mm/mm_init.c > index bcc2f2dd8021..2785be04e7bb 100644 > --- a/mm/mm_init.c > +++ b/mm/mm_init.c > @@ -1836,13 +1836,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) > for_each_node(nid) { > pg_data_t *pgdat; > > - if (!node_online(nid)) { > - /* Allocator not initialized yet */ > - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > - if (!pgdat) > - panic("Cannot allocate %zuB for node %d.\n", > - sizeof(*pgdat), nid); > - } > + if (!node_online(nid)) > + alloc_offline_node_data(nid); > > pgdat = NODE_DATA(nid); > free_area_init_node(nid); > diff --git a/mm/numa.c b/mm/numa.c > index da27eb151dc5..07e486a977c7 100644 > --- a/mm/numa.c > +++ b/mm/numa.c > @@ -34,6 +34,18 @@ void __init alloc_node_data(int nid) > memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); > } > > +void __init alloc_offline_node_data(int nit) nid > +{ > + pg_data_t *pgdat; > + > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > + if (!pgdat) > + panic("Cannot allocate %zuB for node %d.\n", > + sizeof(*pgdat), nid); > + > + node_data[nid] = pgdat; > +} > + > /* Stub functions: */ > > #ifndef memory_add_physaddr_to_nid > > >
On Sun, Aug 04, 2024 at 04:11:19PM +0100, Jonathan Cameron wrote: > On Sun, 4 Aug 2024 10:24:15 +0300 > Mike Rapoport <rppt@kernel.org> wrote: > > > On Sat, Aug 03, 2024 at 11:58:13AM -0700, Andrew Morton wrote: > > > On Fri, 2 Aug 2024 10:49:22 +0100 Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote: > > > > > > > > --- a/mm/mm_init.c > > > > > +++ b/mm/mm_init.c > > > > > @@ -1838,11 +1838,10 @@ void __init free_area_init(unsigned long *max_zone_pfn) > > > > > > > > > > if (!node_online(nid)) { > > > > > /* Allocator not initialized yet */ > > > > > - pgdat = arch_alloc_nodedata(nid); > > > > > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > > > > > if (!pgdat) > > > > > panic("Cannot allocate %zuB for node %d.\n", > > > > > sizeof(*pgdat), nid); > > > > > - arch_refresh_nodedata(nid, pgdat); > > > > > > > > This allocates pgdat but never sets node_data[nid] to it > > > > and promptly leaks it on the line below. > > > > > > > > Just to sanity check this I spun up a qemu machine with no memory > > > > initially present on some nodes and it went boom as you'd expect. > > > > > > > > I tested with addition of > > > > NODE_DATA(nid) = pgdat; > > > > and it all seems to work as expected. > > > > > > Thanks, I added that. It blew up on x86_64 allnoconfig because > > > node_data[] (and hence NODE_DATA()) isn't an lvalue when CONFIG_NUMA=n. > > > > > > I'll put some #ifdef CONFIG_NUMAs in there for now but > > > > > > a) NODE_DATA() is upper-case. Implies "constant". Shouldn't be assigned to. > > > > > > b) NODE_DATA() should be non-lvalue when CONFIG_NUMA=y also. But no, > > > we insist on implementing things in cpp instead of in C. > > > > This looks like a candidate for a separate tree-wide cleanup. > > > > > c) In fact assigning to anything which ends in "()" is nuts. Please > > > clean up my tempfix. > > > > > > c) Mike, generally I'm wondering if there's a bunch of code here > > > which isn't needed on CONFIG_NUMA=n. Please check all of this for > > > unneeded bloatiness. > > > > I believe the patch addresses your concerns, just with this the commit log > > needs update. Instead of > > > > Replace the call to arch_alloc_nodedata() in free_area_init() with > > memblock_alloc(), remove arch_refresh_nodedata() and cleanup > > include/linux/memory_hotplug.h from the associated ifdefery. > > > > it should be > > > > Replace the call to arch_alloc_nodedata() in free_area_init() with a > > new helper alloc_offline_node_data(), remove arch_refresh_nodedata() > > and cleanup include/linux/memory_hotplug.h from the associated > > ifdefery. > > > > I can send an updated patch if you prefer. > This solution looks good to me - except for a Freudian typo that means it won't > compile :) Right :) I'll post v4 after kbuild confirms it compiles :) > Jonathan > > > > > diff --git a/include/linux/numa.h b/include/linux/numa.h > > index 3b12d8ca0afd..5a749fd67f39 100644 > > --- a/include/linux/numa.h > > +++ b/include/linux/numa.h > > @@ -34,6 +34,7 @@ extern struct pglist_data *node_data[]; > > #define NODE_DATA(nid) (node_data[nid]) > > > > void __init alloc_node_data(int nid); > > +void __init alloc_offline_node_data(int nit); > > > > /* Generic implementation available */ > > int numa_nearest_node(int node, unsigned int state); > > @@ -62,6 +63,8 @@ static inline int phys_to_target_node(u64 start) > > { > > return 0; > > } > > + > > +static inline void alloc_offline_node_data(int nit) {} > nid > > #endif > > > > #define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) > > diff --git a/mm/mm_init.c b/mm/mm_init.c > > index bcc2f2dd8021..2785be04e7bb 100644 > > --- a/mm/mm_init.c > > +++ b/mm/mm_init.c > > @@ -1836,13 +1836,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) > > for_each_node(nid) { > > pg_data_t *pgdat; > > > > - if (!node_online(nid)) { > > - /* Allocator not initialized yet */ > > - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > > - if (!pgdat) > > - panic("Cannot allocate %zuB for node %d.\n", > > - sizeof(*pgdat), nid); > > - } > > + if (!node_online(nid)) > > + alloc_offline_node_data(nid); > > > > pgdat = NODE_DATA(nid); > > free_area_init_node(nid); > > diff --git a/mm/numa.c b/mm/numa.c > > index da27eb151dc5..07e486a977c7 100644 > > --- a/mm/numa.c > > +++ b/mm/numa.c > > @@ -34,6 +34,18 @@ void __init alloc_node_data(int nid) > > memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); > > } > > > > +void __init alloc_offline_node_data(int nit) > > nid > > > +{ > > + pg_data_t *pgdat; > > + > > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > > + if (!pgdat) > > + panic("Cannot allocate %zuB for node %d.\n", > > + sizeof(*pgdat), nid); > > + > > + node_data[nid] = pgdat; > > +} > > + > > /* Stub functions: */ > > > > #ifndef memory_add_physaddr_to_nid > > > > > > > >
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ebe876930e78..b27ddce5d324 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,54 +16,6 @@ struct resource; struct vmem_altmap; struct dev_pagemap; -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) - -#ifdef CONFIG_NUMA -/* - * XXX: node aware allocation can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ -}) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); diff --git a/mm/mm_init.c b/mm/mm_init.c index 75c3bd42799b..bcc2f2dd8021 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1838,11 +1838,10 @@ void __init free_area_init(unsigned long *max_zone_pfn) if (!node_online(nid)) { /* Allocator not initialized yet */ - pgdat = arch_alloc_nodedata(nid); + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); if (!pgdat) panic("Cannot allocate %zuB for node %d.\n", sizeof(*pgdat), nid); - arch_refresh_nodedata(nid, pgdat); } pgdat = NODE_DATA(nid);