[RFC,13/15] mm: convert MAX_ORDER sized static arrays to dynamic ones.

Message ID 20210805190253.2795604-14-zi.yan@sent.com (mailing list archive)
State New
Series Make MAX_ORDER adjustable as a kernel boot time parameter.

Commit Message

Zi Yan Aug. 5, 2021, 7:02 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

This prepares for the upcoming changes to make MAX_ORDER a boot time
parameter instead of a compile-time constant. All static arrays with
MAX_ORDER size are converted to pointers and their memory is allocated
at runtime.

free_area array in struct zone is allocated using memblock_alloc_node()
at boot time and using kzalloc() when memory is hot-added.

MAX_ORDER in the arm64 nVHE code is independent of the kernel buddy allocator,
so use CONFIG_FORCE_MAX_ZONEORDER instead.

Signed-off-by: Zi Yan <ziy@nvidia.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: kexec@lists.infradead.org
Cc: linux-doc@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
 .../admin-guide/kdump/vmcoreinfo.rst          |  2 +-
 drivers/gpu/drm/ttm/ttm_device.c              |  7 ++-
 drivers/gpu/drm/ttm/ttm_pool.c                | 58 +++++++++++++++++--
 include/drm/ttm/ttm_pool.h                    |  4 +-
 include/linux/memory_hotplug.h                |  1 +
 include/linux/mmzone.h                        |  2 +-
 mm/memory_hotplug.c                           |  1 +
 mm/page_alloc.c                               | 48 ++++++++++++---
 8 files changed, 104 insertions(+), 19 deletions(-)

Comments

Christian König Aug. 5, 2021, 7:16 p.m. UTC | #1
On 05.08.21 at 21:02, Zi Yan wrote:
> From: Zi Yan <ziy@nvidia.com>
>
> This prepares for the upcoming changes to make MAX_ORDER a boot time
> parameter instead of a compile-time constant. All static arrays with
> MAX_ORDER size are converted to pointers and their memory is allocated
> at runtime.

Well, in general I strongly suggest not using the pattern
kmalloc(sizeof(some struct) * MAX_ORDER, ...); instead use kmalloc_array(),
kcalloc(), etc.

Then, when an array is embedded at the end of a structure, you can use a
trailing array and the struct_size() macro to determine the allocation size.
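
In code, those two suggestions would look roughly like this (a sketch only,
reusing the existing global_uncached, ttm_pool_type, and MAX_ORDER; the
ttm_pool_orders wrapper is hypothetical, not actual TTM code):

	/* kcalloc() checks for overflow of count * size, unlike the
	 * open-coded multiplication:
	 */
	global_uncached = kcalloc(MAX_ORDER, sizeof(*global_uncached),
				  GFP_KERNEL);
	if (!global_uncached)
		return -ENOMEM;

	/* When the array trails a structure, a flexible array member plus
	 * struct_size() sizes the whole allocation in one expression:
	 */
	struct ttm_pool_orders {		/* hypothetical wrapper */
		unsigned int nr;
		struct ttm_pool_type orders[];	/* trailing array */
	};
	struct ttm_pool_orders *p;

	p = kzalloc(struct_size(p, orders, MAX_ORDER), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->nr = MAX_ORDER;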

In addition to that, separating out the TTM changes that make the
maximum allocation order independent from MAX_ORDER into their own patch
would be rather good to have, I think.

Regards,
Christian.

>
> free_area array in struct zone is allocated using memblock_alloc_node()
> at boot time and using kzalloc() when memory is hot-added.
>
> MAX_ORDER in the arm64 nVHE code is independent of the kernel buddy allocator,
> so use CONFIG_FORCE_MAX_ZONEORDER instead.
>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> Cc: Dave Young <dyoung@redhat.com>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: kexec@lists.infradead.org
> Cc: linux-doc@vger.kernel.org
> Cc: dri-devel@lists.freedesktop.org
> Cc: linux-mm@kvack.org
> Cc: linux-kernel@vger.kernel.org
> ---
>   .../admin-guide/kdump/vmcoreinfo.rst          |  2 +-
>   drivers/gpu/drm/ttm/ttm_device.c              |  7 ++-
>   drivers/gpu/drm/ttm/ttm_pool.c                | 58 +++++++++++++++++--
>   include/drm/ttm/ttm_pool.h                    |  4 +-
>   include/linux/memory_hotplug.h                |  1 +
>   include/linux/mmzone.h                        |  2 +-
>   mm/memory_hotplug.c                           |  1 +
>   mm/page_alloc.c                               | 48 ++++++++++++---
>   8 files changed, 104 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> index 3861a25faae1..1c9449b9458f 100644
> --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
> +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> @@ -172,7 +172,7 @@ variables.
>   Offset of the free_list's member. This value is used to compute the number
>   of free pages.
>   
> -Each zone has a free_area structure array called free_area[MAX_ORDER].
> +Each zone has a free_area structure array called free_area with length of MAX_ORDER.
>   The free_list represents a linked list of free page blocks.
>   
>   (list_head, next|prev)
> diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
> index 74e3b460132b..7d994c03fbd0 100644
> --- a/drivers/gpu/drm/ttm/ttm_device.c
> +++ b/drivers/gpu/drm/ttm/ttm_device.c
> @@ -94,7 +94,9 @@ static int ttm_global_init(void)
>   		>> PAGE_SHIFT;
>   	num_dma32 = min(num_dma32, 2UL << (30 - PAGE_SHIFT));
>   
> -	ttm_pool_mgr_init(num_pages);
> +	ret = ttm_pool_mgr_init(num_pages);
> +	if (ret)
> +		goto out;
>   	ttm_tt_mgr_init(num_pages, num_dma32);
>   
>   	glob->dummy_read_page = alloc_page(__GFP_ZERO | GFP_DMA32);
> @@ -216,7 +218,8 @@ int ttm_device_init(struct ttm_device *bdev, struct ttm_device_funcs *funcs,
>   	bdev->funcs = funcs;
>   
>   	ttm_sys_man_init(bdev);
> -	ttm_pool_init(&bdev->pool, dev, use_dma_alloc, use_dma32);
> +	if (ttm_pool_init(&bdev->pool, dev, use_dma_alloc, use_dma32))
> +		return -ENOMEM;
>   
>   	bdev->vma_manager = vma_manager;
>   	INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue);
> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> index cb38b1a17b09..ae20c80f14a4 100644
> --- a/drivers/gpu/drm/ttm/ttm_pool.c
> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> @@ -64,11 +64,11 @@ module_param(page_pool_size, ulong, 0644);
>   
>   static atomic_long_t allocated_pages;
>   
> -static struct ttm_pool_type global_write_combined[MAX_ORDER];
> -static struct ttm_pool_type global_uncached[MAX_ORDER];
> +static struct ttm_pool_type *global_write_combined;
> +static struct ttm_pool_type *global_uncached;
>   
> -static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER];
> -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER];
> +static struct ttm_pool_type *global_dma32_write_combined;
> +static struct ttm_pool_type *global_dma32_uncached;
>   
>   static struct mutex shrinker_lock;
>   static struct list_head shrinker_list;
> @@ -493,8 +493,10 @@ EXPORT_SYMBOL(ttm_pool_free);
>    * @use_dma32: true if GFP_DMA32 should be used
>    *
>    * Initialize the pool and its pool types.
> + *
> + * Returns: 0 on successe, negative error code otherwise
>    */
> -void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
> +int ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>   		   bool use_dma_alloc, bool use_dma32)
>   {
>   	unsigned int i, j;
> @@ -506,11 +508,30 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>   	pool->use_dma32 = use_dma32;
>   
>   	if (use_dma_alloc) {
> -		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
> +		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
> +			pool->caching[i].orders =
> +				kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
> +					GFP_KERNEL);
> +			if (!pool->caching[i].orders) {
> +				i--;
> +				goto failed;
> +			}
>   			for (j = 0; j < MAX_ORDER; ++j)
>   				ttm_pool_type_init(&pool->caching[i].orders[j],
>   						   pool, i, j);
> +
> +		}
> +		return 0;
> +
> +failed:
> +		for (; i >= 0; i--) {
> +			for (j = 0; j < MAX_ORDER; ++j)
> +				ttm_pool_type_fini(&pool->caching[i].orders[j]);
> +			kfree(pool->caching[i].orders);
> +		}
> +		return -ENOMEM;
>   	}
> +	return 0;
>   }
>   
>   /**
> @@ -696,6 +717,31 @@ int ttm_pool_mgr_init(unsigned long num_pages)
>   	mutex_init(&shrinker_lock);
>   	INIT_LIST_HEAD(&shrinker_list);
>   
> +	if (!global_write_combined) {
> +		global_write_combined = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
> +						GFP_KERNEL);
> +		if (!global_write_combined)
> +			return -ENOMEM;
> +	}
> +	if (!global_uncached) {
> +		global_uncached = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
> +					  GFP_KERNEL);
> +		if (!global_uncached)
> +			return -ENOMEM;
> +	}
> +	if (!global_dma32_write_combined) {
> +		global_dma32_write_combined = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
> +						      GFP_KERNEL);
> +		if (!global_dma32_write_combined)
> +			return -ENOMEM;
> +	}
> +	if (!global_dma32_uncached) {
> +		global_dma32_uncached = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
> +						GFP_KERNEL);
> +		if (!global_dma32_uncached)
> +			return -ENOMEM;
> +	}
> +
>   	for (i = 0; i < MAX_ORDER; ++i) {
>   		ttm_pool_type_init(&global_write_combined[i], NULL,
>   				   ttm_write_combined, i);
> diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
> index 4321728bdd11..5c09e3cf63ce 100644
> --- a/include/drm/ttm/ttm_pool.h
> +++ b/include/drm/ttm/ttm_pool.h
> @@ -71,7 +71,7 @@ struct ttm_pool {
>   	bool use_dma32;
>   
>   	struct {
> -		struct ttm_pool_type orders[MAX_ORDER];
> +		struct ttm_pool_type *orders;
>   	} caching[TTM_NUM_CACHING_TYPES];
>   };
>   
> @@ -79,7 +79,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>   		   struct ttm_operation_ctx *ctx);
>   void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt);
>   
> -void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
> +int ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>   		   bool use_dma_alloc, bool use_dma32);
>   void ttm_pool_fini(struct ttm_pool *pool);
>   
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 97f874a60607..c16aa66db61e 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -326,6 +326,7 @@ extern void clear_zone_contiguous(struct zone *zone);
>   
>   #ifdef CONFIG_MEMORY_HOTPLUG
>   extern void __ref free_area_init_core_hotplug(int nid);
> +extern void __ref free_area_deinit_core_hotplug(int nid);
>   extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
>   extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
>   extern int add_memory_resource(int nid, struct resource *resource,
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 322b995942e5..09aafc05aef4 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -609,7 +609,7 @@ struct zone {
>   	ZONE_PADDING(_pad1_)
>   
>   	/* free areas of different sizes */
> -	struct free_area	free_area[MAX_ORDER];
> +	struct free_area	*free_area;
>   
>   	/* zone flags, see below */
>   	unsigned long		flags;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 91ca751ac20c..4ce20b6482aa 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1239,6 +1239,7 @@ static void rollback_node_hotadd(int nid)
>   
>   	arch_refresh_nodedata(nid, NULL);
>   	free_percpu(pgdat->per_cpu_nodestats);
> +	free_area_deinit_core_hotplug(nid);
>   	arch_free_nodedata(pgdat);
>   }
>   
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e4657009fd4f..bfa6962f7615 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6053,11 +6053,21 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
>   
>   	for_each_populated_zone(zone) {
>   		unsigned int order;
> -		unsigned long nr[MAX_ORDER], flags, total = 0;
> -		unsigned char types[MAX_ORDER];
> +		unsigned long *nr, flags, total = 0;
> +		unsigned char *types;
>   
>   		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
>   			continue;
> +
> +		nr = kmalloc(sizeof(unsigned long) * MAX_ORDER, GFP_KERNEL);
> +		if (!nr)
> +			goto skip_zone;
> +		types = kmalloc(sizeof(unsigned char) * MAX_ORDER, GFP_KERNEL);
> +		if (!types) {
> +			kfree(nr);
> +			goto skip_zone;
> +		}
> +
>   		show_node(zone);
>   		printk(KERN_CONT "%s: ", zone->name);
>   
> @@ -6083,8 +6093,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
>   				show_migration_types(types[order]);
>   		}
>   		printk(KERN_CONT "= %lukB\n", K(total));
> -	}
>   
> +		kfree(nr);
> +		kfree(types);
> +	}
> +skip_zone:
>   	hugetlb_show_meminfo();
>   
>   	printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
> @@ -7429,8 +7442,8 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>   	lruvec_init(&pgdat->__lruvec);
>   }
>   
> -static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
> -							unsigned long remaining_pages)
> +static void __init zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
> +							unsigned long remaining_pages, bool hotplug)
>   {
>   	atomic_long_set(&zone->managed_pages, remaining_pages);
>   	zone_set_nid(zone, nid);
> @@ -7439,6 +7452,16 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx,
>   	spin_lock_init(&zone->lock);
>   	zone_seqlock_init(zone);
>   	zone_pcp_init(zone);
> +	if (hotplug)
> +		zone->free_area =
> +			kzalloc_node(sizeof(struct free_area) * MAX_ORDER,
> +				     GFP_KERNEL, nid);
> +	else
> +		zone->free_area =
> +			memblock_alloc_node(sizeof(struct free_area) * MAX_ORDER,
> +					    sizeof(struct free_area), nid);
> +	BUG_ON(!zone->free_area);
> +
>   }
>   
>   /*
> @@ -7456,7 +7479,18 @@ void __ref free_area_init_core_hotplug(int nid)
>   
>   	pgdat_init_internals(pgdat);
>   	for (z = 0; z < MAX_NR_ZONES; z++)
> -		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
> +		zone_init_internals(&pgdat->node_zones[z], z, nid, 0, true);
> +}
> +
> +void __ref free_area_deinit_core_hotplug(int nid)
> +{
> +	enum zone_type z;
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +
> +	for (z = 0; z < MAX_NR_ZONES; z++) {
> +		kfree(pgdat->node_zones[z].free_area);
> +		pgdat->node_zones[z].free_area = NULL;
> +	}
>   }
>   #endif
>   
> @@ -7519,7 +7553,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
>   		 * when the bootmem allocator frees pages into the buddy system.
>   		 * And all highmem pages will be managed by the buddy system.
>   		 */
> -		zone_init_internals(zone, j, nid, freesize);
> +		zone_init_internals(zone, j, nid, freesize, false);
>   
>   		if (!size)
>   			continue;
Zi Yan Aug. 5, 2021, 7:58 p.m. UTC | #2
On 5 Aug 2021, at 15:16, Christian König wrote:

> On 05.08.21 at 21:02, Zi Yan wrote:
>> From: Zi Yan <ziy@nvidia.com>
>>
>> This prepares for the upcoming changes to make MAX_ORDER a boot time
>> parameter instead of a compile-time constant. All static arrays with
>> MAX_ORDER size are converted to pointers and their memory is allocated
>> at runtime.
>
> Well, in general I strongly suggest not using the pattern kmalloc(sizeof(some struct) * MAX_ORDER, ...); instead use kmalloc_array(), kcalloc(), etc.
>
> Then, when an array is embedded at the end of a structure, you can use a trailing array and the struct_size() macro to determine the allocation size.

Sure. Will fix it.

>
> In addition to that, separating out the TTM changes that make the maximum allocation order independent from MAX_ORDER into their own patch would be rather good to have, I think.

Can you elaborate a little bit more on “make the maximum allocation order independent from MAX_ORDER”? From what I understand of ttm_pool_alloc(), it tries to get num_pages pages by allocating as large pages as possible, starting from MAX_ORDER. What is the rationale behind this algorithm? Why not just call alloc_page(order=0) num_pages times? Is it meant to reduce the number of calls to alloc_page()? The allocated pages do not need to get as high as MAX_ORDER, is that the case? If yes, I probably can keep ttm pool as static arrays with length of MIN_MAX_ORDER, which I introduce in Patch 14 as the lower bound of boot time parameter MAX_ORDER. Let me know your thoughts.

Thanks.

—
Best Regards,
Yan, Zi
Christian König Aug. 6, 2021, 9:37 a.m. UTC | #3
On 05.08.21 at 21:58, Zi Yan wrote:
> On 5 Aug 2021, at 15:16, Christian König wrote:
>
>> On 05.08.21 at 21:02, Zi Yan wrote:
>>> From: Zi Yan <ziy@nvidia.com>
>>>
>>> This prepares for the upcoming changes to make MAX_ORDER a boot time
>>> parameter instead of a compile-time constant. All static arrays with
>>> MAX_ORDER size are converted to pointers and their memory is allocated
>>> at runtime.
>> Well, in general I strongly suggest not using the pattern kmalloc(sizeof(some struct) * MAX_ORDER, ...); instead use kmalloc_array(), kcalloc(), etc.
>>
>> Then, when an array is embedded at the end of a structure, you can use a trailing array and the struct_size() macro to determine the allocation size.
> Sure. Will fix it.
>
>> In addition to that, separating out the TTM changes that make the maximum allocation order independent from MAX_ORDER into their own patch would be rather good to have, I think.
> Can you elaborate a little bit more on “make the maximum allocation order independent from MAX_ORDER”?

My idea was that you have a single patch that gives the maximum order as a
parameter to the TTM pool.

>  From what I understand of ttm_pool_alloc(), it tries to get num_pages pages by allocating as large pages as possible, starting from MAX_ORDER. What is the rationale behind this algorithm? Why not just call alloc_page(order=0) num_pages times?

What we do here is essentially transparent huge pages for GPUs.

In contrast to CPUs, which can usually only use fixed sizes like 4KiB,
2MiB, or 1GiB, AMD GPUs at least can use any power of two.

So it makes sense to allocate big pages as much as possible and only
fall back to 4KiB pages when necessary.

The downside is that we potentially exhaust larger orders for other use cases.
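
Roughly, that "largest order first, then fall back" idea looks like the
following sketch (illustrative only, not the actual ttm_pool_alloc() code):

	/* Pick the biggest order that still fits the remaining request
	 * (remaining must be non-zero) and step down one order at a time
	 * when the allocation fails.
	 */
	static struct page *alloc_largest_chunk(gfp_t gfp, unsigned int max_order,
						unsigned long remaining,
						unsigned int *allocated_order)
	{
		unsigned int order = min_t(unsigned int, max_order,
					   __fls(remaining));
		struct page *p;

		for (;;) {
			p = alloc_pages(gfp, order);
			if (p || !order)
				break;
			order--;	/* fall back to a smaller block */
		}

		*allocated_order = order;
		return p;	/* NULL only if even order 0 failed */
	}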

> Is it meant to reduce the number of calls to alloc_page()?

That is a nice-to-have side effect, but the performance improvement for
the TLB is the main reason here.

> The allocated pages do not need to get as high as MAX_ORDER, is that the case?

Actually we would really like to have pages as large as 1GiB for best 
TLB utilization :)

> If yes, I probably can keep ttm pool as static arrays with length of MIN_MAX_ORDER, which I introduce in Patch 14 as the lower bound of boot time parameter MAX_ORDER. Let me know your thoughts.

Well, you could do something like MAX_MAX_ORDER with a value of at least
19 (1GiB with 4KiB pages, IIRC), and then limit it to the real available
max_order when you make that a run-time option.

Since the array elements consist only of a linked list and a few extra
parameters/pointers, we probably won't need more than a page or two for
those.
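
Something like the following sketch, with MAX_MAX_ORDER as the compile-time
upper bound and the run-time limit applied at init time (the helper and
variable names are made up; only ttm_pool_type_init() and ttm_write_combined
are existing TTM symbols):

	#define MAX_MAX_ORDER	19	/* covers 1GiB blocks with 4KiB pages */

	static struct ttm_pool_type global_write_combined[MAX_MAX_ORDER];
	static unsigned int ttm_pool_max_order;	/* orders actually in use */

	static void ttm_global_orders_init(unsigned int runtime_max_order)
	{
		unsigned int i;

		/* never index past the statically sized arrays */
		ttm_pool_max_order = min_t(unsigned int, runtime_max_order,
					   MAX_MAX_ORDER);

		for (i = 0; i < ttm_pool_max_order; ++i)
			ttm_pool_type_init(&global_write_combined[i], NULL,
					   ttm_write_combined, i);
	}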

Regards,
Christian.

>
> Thanks.
>
> —
> Best Regards,
> Yan, Zi
Zi Yan Aug. 6, 2021, 2 p.m. UTC | #4
On 6 Aug 2021, at 5:37, Christian König wrote:

> On 05.08.21 at 21:58, Zi Yan wrote:
>> On 5 Aug 2021, at 15:16, Christian König wrote:
>>
>>> On 05.08.21 at 21:02, Zi Yan wrote:
>>>> From: Zi Yan <ziy@nvidia.com>
>>>>
>>>> This prepares for the upcoming changes to make MAX_ORDER a boot time
>>>> parameter instead of a compile-time constant. All static arrays with
>>>> MAX_ORDER size are converted to pointers and their memory is allocated
>>>> at runtime.
>>> Well, in general I strongly suggest not using the pattern kmalloc(sizeof(some struct) * MAX_ORDER, ...); instead use kmalloc_array(), kcalloc(), etc.
>>>
>>> Then, when an array is embedded at the end of a structure, you can use a trailing array and the struct_size() macro to determine the allocation size.
>> Sure. Will fix it.
>>
>>> In addition to that, separating out the TTM changes that make the maximum allocation order independent from MAX_ORDER into their own patch would be rather good to have, I think.
>> Can you elaborate a little bit more on “make the maximum allocation order independent from MAX_ORDER”?
>
> My idea was that you have a single patch that gives the maximum order as a parameter to the TTM pool.

Got it. No problem.
>
>>  From what I understand of ttm_pool_alloc(), it tries to get num_pages pages by allocating as large pages as possible, starting from MAX_ORDER. What is the rationale behind this algorithm? Why not just call alloc_page(order=0) num_pages times?
>
> What we do here is essentially transparent huge pages for GPUs.
>
> In contrast to CPUs, which can usually only use fixed sizes like 4KiB, 2MiB, or 1GiB, AMD GPUs at least can use any power of two.

FYI, Matthew Wilcox’s memory folio patchset adds support for arbitrary THP sizes[1]. You might want to check it out. :)

>
> So it makes sense to allocate big pages as much as possible and only fall back to 4KiB pages when necessary.
>
> The downside is that we potentially exhaust larger orders for other use cases.
>
> Is it meant to reduce the number of calls to alloc_page()?
>
> That is a nice-to-have side effect, but the performance improvement for the TLB is the main reason here.
>
>> The allocated pages do not need to get as high as MAX_ORDER, is that the case?
>
> Actually we would really like to have pages as large as 1GiB for best TLB utilization :)
>
>> If yes, I probably can keep ttm pool as static arrays with length of MIN_MAX_ORDER, which I introduce in Patch 14 as the lower bound of boot time parameter MAX_ORDER. Let me know your thoughts.
>
> Well, you could do something like MAX_MAX_ORDER with a value of at least 19 (1GiB with 4KiB pages, IIRC), and then limit it to the real available max_order when you make that a run-time option.
>
> Since the array elements consist only of a linked list and a few extra parameters/pointers, we probably won't need more than a page or two for those.

Thanks for your explanation. Now I understand that ttm_pool does want pages as large as possible for performance reasons. I will keep the existing code, so that ttm_pool can get the largest pages from the buddy allocator. I will separate the changes to TTM into a single patch, as you suggested.


[1] https://lore.kernel.org/linux-mm/20210715033704.692967-1-willy@infradead.org

—
Best Regards,
Yan, Zi

Patch

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 3861a25faae1..1c9449b9458f 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -172,7 +172,7 @@  variables.
 Offset of the free_list's member. This value is used to compute the number
 of free pages.
 
-Each zone has a free_area structure array called free_area[MAX_ORDER].
+Each zone has a free_area structure array called free_area with length of MAX_ORDER.
 The free_list represents a linked list of free page blocks.
 
 (list_head, next|prev)
diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 74e3b460132b..7d994c03fbd0 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -94,7 +94,9 @@  static int ttm_global_init(void)
 		>> PAGE_SHIFT;
 	num_dma32 = min(num_dma32, 2UL << (30 - PAGE_SHIFT));
 
-	ttm_pool_mgr_init(num_pages);
+	ret = ttm_pool_mgr_init(num_pages);
+	if (ret)
+		goto out;
 	ttm_tt_mgr_init(num_pages, num_dma32);
 
 	glob->dummy_read_page = alloc_page(__GFP_ZERO | GFP_DMA32);
@@ -216,7 +218,8 @@  int ttm_device_init(struct ttm_device *bdev, struct ttm_device_funcs *funcs,
 	bdev->funcs = funcs;
 
 	ttm_sys_man_init(bdev);
-	ttm_pool_init(&bdev->pool, dev, use_dma_alloc, use_dma32);
+	if (ttm_pool_init(&bdev->pool, dev, use_dma_alloc, use_dma32))
+		return -ENOMEM;
 
 	bdev->vma_manager = vma_manager;
 	INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue);
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index cb38b1a17b09..ae20c80f14a4 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -64,11 +64,11 @@  module_param(page_pool_size, ulong, 0644);
 
 static atomic_long_t allocated_pages;
 
-static struct ttm_pool_type global_write_combined[MAX_ORDER];
-static struct ttm_pool_type global_uncached[MAX_ORDER];
+static struct ttm_pool_type *global_write_combined;
+static struct ttm_pool_type *global_uncached;
 
-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER];
-static struct ttm_pool_type global_dma32_uncached[MAX_ORDER];
+static struct ttm_pool_type *global_dma32_write_combined;
+static struct ttm_pool_type *global_dma32_uncached;
 
 static struct mutex shrinker_lock;
 static struct list_head shrinker_list;
@@ -493,8 +493,10 @@  EXPORT_SYMBOL(ttm_pool_free);
  * @use_dma32: true if GFP_DMA32 should be used
  *
  * Initialize the pool and its pool types.
+ *
+ * Returns: 0 on successe, negative error code otherwise
  */
-void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
+int ttm_pool_init(struct ttm_pool *pool, struct device *dev,
 		   bool use_dma_alloc, bool use_dma32)
 {
 	unsigned int i, j;
@@ -506,11 +508,30 @@  void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
 	pool->use_dma32 = use_dma32;
 
 	if (use_dma_alloc) {
-		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
+		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
+			pool->caching[i].orders =
+				kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
+					GFP_KERNEL);
+			if (!pool->caching[i].orders) {
+				i--;
+				goto failed;
+			}
 			for (j = 0; j < MAX_ORDER; ++j)
 				ttm_pool_type_init(&pool->caching[i].orders[j],
 						   pool, i, j);
+
+		}
+		return 0;
+
+failed:
+		for (; i >= 0; i--) {
+			for (j = 0; j < MAX_ORDER; ++j)
+				ttm_pool_type_fini(&pool->caching[i].orders[j]);
+			kfree(pool->caching[i].orders);
+		}
+		return -ENOMEM;
 	}
+	return 0;
 }
 
 /**
@@ -696,6 +717,31 @@  int ttm_pool_mgr_init(unsigned long num_pages)
 	mutex_init(&shrinker_lock);
 	INIT_LIST_HEAD(&shrinker_list);
 
+	if (!global_write_combined) {
+		global_write_combined = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
+						GFP_KERNEL);
+		if (!global_write_combined)
+			return -ENOMEM;
+	}
+	if (!global_uncached) {
+		global_uncached = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
+					  GFP_KERNEL);
+		if (!global_uncached)
+			return -ENOMEM;
+	}
+	if (!global_dma32_write_combined) {
+		global_dma32_write_combined = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
+						      GFP_KERNEL);
+		if (!global_dma32_write_combined)
+			return -ENOMEM;
+	}
+	if (!global_dma32_uncached) {
+		global_dma32_uncached = kzalloc(sizeof(struct ttm_pool_type) * MAX_ORDER,
+						GFP_KERNEL);
+		if (!global_dma32_uncached)
+			return -ENOMEM;
+	}
+
 	for (i = 0; i < MAX_ORDER; ++i) {
 		ttm_pool_type_init(&global_write_combined[i], NULL,
 				   ttm_write_combined, i);
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index 4321728bdd11..5c09e3cf63ce 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -71,7 +71,7 @@  struct ttm_pool {
 	bool use_dma32;
 
 	struct {
-		struct ttm_pool_type orders[MAX_ORDER];
+		struct ttm_pool_type *orders;
 	} caching[TTM_NUM_CACHING_TYPES];
 };
 
@@ -79,7 +79,7 @@  int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
 		   struct ttm_operation_ctx *ctx);
 void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt);
 
-void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
+int ttm_pool_init(struct ttm_pool *pool, struct device *dev,
 		   bool use_dma_alloc, bool use_dma32);
 void ttm_pool_fini(struct ttm_pool *pool);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 97f874a60607..c16aa66db61e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -326,6 +326,7 @@  extern void clear_zone_contiguous(struct zone *zone);
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 extern void __ref free_area_init_core_hotplug(int nid);
+extern void __ref free_area_deinit_core_hotplug(int nid);
 extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
 extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
 extern int add_memory_resource(int nid, struct resource *resource,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 322b995942e5..09aafc05aef4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -609,7 +609,7 @@  struct zone {
 	ZONE_PADDING(_pad1_)
 
 	/* free areas of different sizes */
-	struct free_area	free_area[MAX_ORDER];
+	struct free_area	*free_area;
 
 	/* zone flags, see below */
 	unsigned long		flags;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 91ca751ac20c..4ce20b6482aa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1239,6 +1239,7 @@  static void rollback_node_hotadd(int nid)
 
 	arch_refresh_nodedata(nid, NULL);
 	free_percpu(pgdat->per_cpu_nodestats);
+	free_area_deinit_core_hotplug(nid);
 	arch_free_nodedata(pgdat);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e4657009fd4f..bfa6962f7615 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6053,11 +6053,21 @@  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 
 	for_each_populated_zone(zone) {
 		unsigned int order;
-		unsigned long nr[MAX_ORDER], flags, total = 0;
-		unsigned char types[MAX_ORDER];
+		unsigned long *nr, flags, total = 0;
+		unsigned char *types;
 
 		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
 			continue;
+
+		nr = kmalloc(sizeof(unsigned long) * MAX_ORDER, GFP_KERNEL);
+		if (!nr)
+			goto skip_zone;
+		types = kmalloc(sizeof(unsigned char) * MAX_ORDER, GFP_KERNEL);
+		if (!types) {
+			kfree(nr);
+			goto skip_zone;
+		}
+
 		show_node(zone);
 		printk(KERN_CONT "%s: ", zone->name);
 
@@ -6083,8 +6093,11 @@  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 				show_migration_types(types[order]);
 		}
 		printk(KERN_CONT "= %lukB\n", K(total));
-	}
 
+		kfree(nr);
+		kfree(types);
+	}
+skip_zone:
 	hugetlb_show_meminfo();
 
 	printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
@@ -7429,8 +7442,8 @@  static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 	lruvec_init(&pgdat->__lruvec);
 }
 
-static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
-							unsigned long remaining_pages)
+static void __init zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+							unsigned long remaining_pages, bool hotplug)
 {
 	atomic_long_set(&zone->managed_pages, remaining_pages);
 	zone_set_nid(zone, nid);
@@ -7439,6 +7452,16 @@  static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx,
 	spin_lock_init(&zone->lock);
 	zone_seqlock_init(zone);
 	zone_pcp_init(zone);
+	if (hotplug)
+		zone->free_area =
+			kzalloc_node(sizeof(struct free_area) * MAX_ORDER,
+				     GFP_KERNEL, nid);
+	else
+		zone->free_area =
+			memblock_alloc_node(sizeof(struct free_area) * MAX_ORDER,
+					    sizeof(struct free_area), nid);
+	BUG_ON(!zone->free_area);
+
 }
 
 /*
@@ -7456,7 +7479,18 @@  void __ref free_area_init_core_hotplug(int nid)
 
 	pgdat_init_internals(pgdat);
 	for (z = 0; z < MAX_NR_ZONES; z++)
-		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+		zone_init_internals(&pgdat->node_zones[z], z, nid, 0, true);
+}
+
+void __ref free_area_deinit_core_hotplug(int nid)
+{
+	enum zone_type z;
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	for (z = 0; z < MAX_NR_ZONES; z++) {
+		kfree(pgdat->node_zones[z].free_area);
+		pgdat->node_zones[z].free_area = NULL;
+	}
 }
 #endif
 
@@ -7519,7 +7553,7 @@  static void __init free_area_init_core(struct pglist_data *pgdat)
 		 * when the bootmem allocator frees pages into the buddy system.
 		 * And all highmem pages will be managed by the buddy system.
 		 */
-		zone_init_internals(zone, j, nid, freesize);
+		zone_init_internals(zone, j, nid, freesize, false);
 
 		if (!size)
 			continue;