
[v2,3/4] mm: add find_alloc_contig_pages() interface

Message ID 20180503232935.22539-4-mike.kravetz@oracle.com (mailing list archive)
State New, archived

Commit Message

Mike Kravetz May 3, 2018, 11:29 p.m. UTC
find_alloc_contig_pages() is a new interface that attempts to locate
and allocate a contiguous range of pages.  It is provided as a more
convenient interface than alloc_contig_range() which is currently
used by CMA and gigantic huge pages.

When attempting to allocate a range of pages, migration is employed
if possible.  There is no guarantee that the routine will succeed.
So, the user must be prepared for failure and have a fall back plan.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 include/linux/gfp.h |  12 +++++
 mm/page_alloc.c     | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 146 insertions(+), 2 deletions(-)

Comments

Vlastimil Babka May 21, 2018, 8:54 a.m. UTC | #1
On 05/04/2018 01:29 AM, Mike Kravetz wrote:
> find_alloc_contig_pages() is a new interface that attempts to locate
> and allocate a contiguous range of pages.  It is provided as a more

How about dropping the 'find_' from the name, so it's more like other
allocator functions? All of them have to 'find' the free pages in some
sense.

> convenient interface than alloc_contig_range() which is currently
> used by CMA and gigantic huge pages.
> 
> When attempting to allocate a range of pages, migration is employed
> if possible.  There is no guarantee that the routine will succeed.
> So, the user must be prepared for failure and have a fall back plan.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  include/linux/gfp.h |  12 +++++
>  mm/page_alloc.c     | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 146 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 86a0d06463ab..b0d11777d487 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -573,6 +573,18 @@ static inline bool pm_suspended_storage(void)
>  extern int alloc_contig_range(unsigned long start, unsigned long end,
>  			      unsigned migratetype, gfp_t gfp_mask);
>  extern void free_contig_range(unsigned long pfn, unsigned long nr_pages);
> +extern struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
> +						int nid, nodemask_t *nodemask);
> +extern void free_contig_pages(struct page *page, unsigned long nr_pages);
> +#else
> +static inline struct page *find_alloc_contig_pages(unsigned long nr_pages,
> +				gfp_t gfp, int nid, nodemask_t *nodemask)
> +{
> +	return NULL;
> +}
> +static inline void free_contig_pages(struct page *page, unsigned long nr_pages)
> +{
> +}
>  #endif
>  
>  #ifdef CONFIG_CMA
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index cb1a5e0be6ee..d0a2d0da9eae 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -67,6 +67,7 @@
>  #include <linux/ftrace.h>
>  #include <linux/lockdep.h>
>  #include <linux/nmi.h>
> +#include <linux/mmzone.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -7913,8 +7914,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  
>  	/* Make sure the range is really isolated. */
>  	if (test_pages_isolated(outer_start, end, false)) {
> -		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
> -			__func__, outer_start, end);
> +#ifdef MIGRATE_CMA
> +		/* Only print messages for CMA allocations */
> +		if (migratetype == MIGRATE_CMA)

I think is_migrate_cma() can be used to avoid the #ifdef.
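
I.e. something like this (untested); is_migrate_cma() already evaluates
to false when CONFIG_CMA is not enabled, so both the #ifdef and the
direct MIGRATE_CMA reference go away:

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, false)) {
		/* Only print messages for CMA allocations */
		if (is_migrate_cma(migratetype))
			pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
				__func__, outer_start, end);
		ret = -EBUSY;
		goto done;
	}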

> +			pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
> +				__func__, outer_start, end);
> +#endif
>  		ret = -EBUSY;
>  		goto done;
>  	}
> @@ -7950,6 +7955,133 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages)
>  	}
>  	WARN(count != 0, "%ld pages are still in use!\n", count);
>  }
> +
> +/*
> + * Only check for obvious pfn/pages which can not be used/migrated.  The
> + * migration code will do the final check.  Under stress, this minimal set
> + * has been observed to provide the best results.  The checks can be expanded
> + * if needed.

Hm, I kind of doubt this is optimal; it tests almost nothing besides
basic validity, so it won't exclude ranges where the allocation will
fail. I will write more in a reply to the header where complexity is
discussed.
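
As an illustration of what "expanded" could look like (a sketch only,
untested), a couple of cheap extra filters that do not require walking
any lists, e.g. rejecting offline sections and reserved pages:

	for (i = start_pfn; i < end_pfn; i++) {
		if (!pfn_valid(i))
			return false;

		page = pfn_to_online_page(i);
		if (!page)			/* section not online */
			return false;

		if (page_zone(page) != z)
			return false;

		/* reserved pages can never be migrated away */
		if (PageReserved(page))
			return false;
	}

	return true;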

> + */
> +static bool contig_pfn_range_valid(struct zone *z, unsigned long start_pfn,
> +					unsigned long nr_pages)
> +{
> +	unsigned long i, end_pfn = start_pfn + nr_pages;
> +	struct page *page;
> +
> +	for (i = start_pfn; i < end_pfn; i++) {
> +		if (!pfn_valid(i))
> +			return false;
> +
> +		page = pfn_to_online_page(i);
> +
> +		if (page_zone(page) != z)
> +			return false;
> +
> +	}
> +
> +	return true;
> +}
> +
> +/*
> + * Search for and attempt to allocate contiguous allocations greater than
> + * MAX_ORDER.
> + */
> +static struct page *__alloc_contig_pages_nodemask(gfp_t gfp,
> +						unsigned long order,
> +						int nid, nodemask_t *nodemask)
> +{
> +	unsigned long nr_pages, pfn, flags;
> +	struct page *ret_page = NULL;
> +	struct zonelist *zonelist;
> +	struct zoneref *z;
> +	struct zone *zone;
> +	int rc;
> +
> +	nr_pages = 1 << order;
> +	zonelist = node_zonelist(nid, gfp);
> +	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp),
> +					nodemask) {
> +		pgdat_resize_lock(zone->zone_pgdat, &flags);
> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> +		while (zone_spans_pfn(zone, pfn + nr_pages - 1)) {
> +			if (contig_pfn_range_valid(zone, pfn, nr_pages)) {
> +				struct page *page = pfn_to_online_page(pfn);
> +				unsigned int migratetype;
> +
> +				/*
> +				 * All pageblocks in range must be of same
> +				 * migrate type.
> +				 */
> +				migratetype = get_pageblock_migratetype(page);
> +				pgdat_resize_unlock(zone->zone_pgdat, &flags);
> +
> +				rc = alloc_contig_range(pfn, pfn + nr_pages,
> +						migratetype, gfp);
> +				if (!rc) {
> +					ret_page = pfn_to_page(pfn);
> +					return ret_page;
> +				}
> +				pgdat_resize_lock(zone->zone_pgdat, &flags);
> +			}
> +			pfn += nr_pages;
> +		}
> +		pgdat_resize_unlock(zone->zone_pgdat, &flags);
> +	}
> +
> +	return ret_page;
> +}
> +
> +/**
> + * find_alloc_contig_pages() -- attempt to find and allocate a contiguous
> + *				range of pages
> + * @nr_pages:	number of pages to find/allocate
> + * @gfp:	gfp mask used to limit search as well as during compaction
> + * @nid:	target node
> + * @nodemask:	mask of other possible nodes
> + *
> + * Pages can be freed with a call to free_contig_pages(), or by manually
> + * calling __free_page() for each page allocated.
> + *
> + * Return: pointer to 'order' pages on success, or NULL if not successful.
> + */
> +struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
> +					int nid, nodemask_t *nodemask)
> +{
> +	unsigned long i, alloc_order, order_pages;
> +	struct page *pages;
> +
> +	/*
> +	 * Underlying allocators perform page order sized allocations.
> +	 */
> +	alloc_order = get_count_order(nr_pages);

So it takes arbitrary nr_pages but converts it to an order anyway? I think
that's rather suboptimal and wasteful... e.g. a range could be skipped
because some of the pages added by rounding cannot be migrated away.
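
To make that concrete (illustrative numbers only): a request for
nr_pages = 1200 becomes get_count_order(1200) = 11, i.e. 2048 pages.
With 4KB pages that is an 8MB range, aligned to 8MB because of the
ALIGN(zone->zone_start_pfn, nr_pages) in the search loop, and the whole
range has to be migratable even though 848 of the pages are freed again
right after the allocation succeeds.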

Vlastimil

> +	if (alloc_order < MAX_ORDER) {
> +		pages = __alloc_pages_nodemask(gfp, (unsigned int)alloc_order,
> +						nid, nodemask);
> +		split_page(pages, alloc_order);
> +	} else {
> +		pages = __alloc_contig_pages_nodemask(gfp, alloc_order, nid,
> +							nodemask);
> +	}
> +
> +	if (pages) {
> +		/*
> +		 * More pages than desired could have been allocated due to
> +		 * rounding up to next page order.  Free any excess pages.
> +		 */
> +		order_pages = 1UL << alloc_order;
> +		for (i = nr_pages; i < order_pages; i++)
> +			__free_page(pages + i);
> +	}
> +
> +	return pages;
> +}
> +EXPORT_SYMBOL_GPL(find_alloc_contig_pages);
> +
> +void free_contig_pages(struct page *page, unsigned long nr_pages)
> +{
> +	free_contig_range(page_to_pfn(page), nr_pages);
> +}
> +EXPORT_SYMBOL_GPL(free_contig_pages);
>  #endif
>  
>  #if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA
>
Mike Kravetz May 21, 2018, 11:48 p.m. UTC | #2
On 05/21/2018 01:54 AM, Vlastimil Babka wrote:
> On 05/04/2018 01:29 AM, Mike Kravetz wrote:
>> find_alloc_contig_pages() is a new interface that attempts to locate
>> and allocate a contiguous range of pages.  It is provided as a more
> 
> How about dropping the 'find_' from the name, so it's more like other
> allocator functions? All of them have to 'find' the free pages in some
> sense.

Sure

> 
>> convenient interface than alloc_contig_range() which is currently
>> used by CMA and gigantic huge pages.
>>
>> When attempting to allocate a range of pages, migration is employed
>> if possible.  There is no guarantee that the routine will succeed.
>> So, the user must be prepared for failure and have a fall back plan.
>>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>  include/linux/gfp.h |  12 +++++
>>  mm/page_alloc.c     | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 146 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
>> index 86a0d06463ab..b0d11777d487 100644
>> --- a/include/linux/gfp.h
>> +++ b/include/linux/gfp.h
>> @@ -573,6 +573,18 @@ static inline bool pm_suspended_storage(void)
>>  extern int alloc_contig_range(unsigned long start, unsigned long end,
>>  			      unsigned migratetype, gfp_t gfp_mask);
>>  extern void free_contig_range(unsigned long pfn, unsigned long nr_pages);
>> +extern struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
>> +						int nid, nodemask_t *nodemask);
>> +extern void free_contig_pages(struct page *page, unsigned long nr_pages);
>> +#else
>> +static inline struct page *find_alloc_contig_pages(unsigned long nr_pages,
>> +				gfp_t gfp, int nid, nodemask_t *nodemask)
>> +{
>> +	return NULL;
>> +}
>> +static inline void free_contig_pages(struct page *page, unsigned long nr_pages)
>> +{
>> +}
>>  #endif
>>  
>>  #ifdef CONFIG_CMA
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index cb1a5e0be6ee..d0a2d0da9eae 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -67,6 +67,7 @@
>>  #include <linux/ftrace.h>
>>  #include <linux/lockdep.h>
>>  #include <linux/nmi.h>
>> +#include <linux/mmzone.h>
>>  
>>  #include <asm/sections.h>
>>  #include <asm/tlbflush.h>
>> @@ -7913,8 +7914,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>  
>>  	/* Make sure the range is really isolated. */
>>  	if (test_pages_isolated(outer_start, end, false)) {
>> -		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
>> -			__func__, outer_start, end);
>> +#ifdef MIGRATE_CMA
>> +		/* Only print messages for CMA allocations */
>> +		if (migratetype == MIGRATE_CMA)
> 
> I think is_migrate_cma() can be used to avoid the #ifdef.
> 

Thanks.  I missed that and did not want to create something new.

>> +			pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
>> +				__func__, outer_start, end);
>> +#endif
>>  		ret = -EBUSY;
>>  		goto done;
>>  	}
>> @@ -7950,6 +7955,133 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages)
>>  	}
>>  	WARN(count != 0, "%ld pages are still in use!\n", count);
>>  }
>> +
>> +/*
>> + * Only check for obvious pfn/pages which can not be used/migrated.  The
>> + * migration code will do the final check.  Under stress, this minimal set
>> + * has been observed to provide the best results.  The checks can be expanded
>> + * if needed.
> 
> Hm, I kind of doubt this is optimal; it tests almost nothing besides
> basic validity, so it won't exclude ranges where the allocation will
> fail. I will write more in a reply to the header where complexity is
> discussed.
> 

Ok.  This 'appeared' to work best in testing where I had all CPUs in tight
loops calling this new interface to allocate and then free contiguous pages.
I was somewhat surprised at the result, and it may just be due to the nature
of my testing.

>> + */
>> +static bool contig_pfn_range_valid(struct zone *z, unsigned long start_pfn,
>> +					unsigned long nr_pages)
>> +{
>> +	unsigned long i, end_pfn = start_pfn + nr_pages;
>> +	struct page *page;
>> +
>> +	for (i = start_pfn; i < end_pfn; i++) {
>> +		if (!pfn_valid(i))
>> +			return false;
>> +
>> +		page = pfn_to_online_page(i);
>> +
>> +		if (page_zone(page) != z)
>> +			return false;
>> +
>> +	}
>> +
>> +	return true;
>> +}
>> +
>> +/*
>> + * Search for and attempt to allocate contiguous allocations greater than
>> + * MAX_ORDER.
>> + */
>> +static struct page *__alloc_contig_pages_nodemask(gfp_t gfp,
>> +						unsigned long order,
>> +						int nid, nodemask_t *nodemask)
>> +{
>> +	unsigned long nr_pages, pfn, flags;
>> +	struct page *ret_page = NULL;
>> +	struct zonelist *zonelist;
>> +	struct zoneref *z;
>> +	struct zone *zone;
>> +	int rc;
>> +
>> +	nr_pages = 1 << order;
>> +	zonelist = node_zonelist(nid, gfp);
>> +	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp),
>> +					nodemask) {
>> +		pgdat_resize_lock(zone->zone_pgdat, &flags);
>> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
>> +		while (zone_spans_pfn(zone, pfn + nr_pages - 1)) {
>> +			if (contig_pfn_range_valid(zone, pfn, nr_pages)) {
>> +				struct page *page = pfn_to_online_page(pfn);
>> +				unsigned int migratetype;
>> +
>> +				/*
>> +				 * All pageblocks in range must be of same
>> +				 * migrate type.
>> +				 */
>> +				migratetype = get_pageblock_migratetype(page);
>> +				pgdat_resize_unlock(zone->zone_pgdat, &flags);
>> +
>> +				rc = alloc_contig_range(pfn, pfn + nr_pages,
>> +						migratetype, gfp);
>> +				if (!rc) {
>> +					ret_page = pfn_to_page(pfn);
>> +					return ret_page;
>> +				}
>> +				pgdat_resize_lock(zone->zone_pgdat, &flags);
>> +			}
>> +			pfn += nr_pages;
>> +		}
>> +		pgdat_resize_unlock(zone->zone_pgdat, &flags);
>> +	}
>> +
>> +	return ret_page;
>> +}
>> +
>> +/**
>> + * find_alloc_contig_pages() -- attempt to find and allocate a contiguous
>> + *				range of pages
>> + * @nr_pages:	number of pages to find/allocate
>> + * @gfp:	gfp mask used to limit search as well as during compaction
>> + * @nid:	target node
>> + * @nodemask:	mask of other possible nodes
>> + *
>> + * Pages can be freed with a call to free_contig_pages(), or by manually
>> + * calling __free_page() for each page allocated.
>> + *
>> + * Return: pointer to 'order' pages on success, or NULL if not successful.
>> + */
>> +struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
>> +					int nid, nodemask_t *nodemask)
>> +{
>> +	unsigned long i, alloc_order, order_pages;
>> +	struct page *pages;
>> +
>> +	/*
>> +	 * Underlying allocators perform page order sized allocations.
>> +	 */
>> +	alloc_order = get_count_order(nr_pages);
> 
> So it takes arbitrary nr_pages but converts it to an order anyway? I think
> that's rather suboptimal and wasteful... e.g. a range could be skipped
> because some of the pages added by rounding cannot be migrated away.

Yes.  My idea with this series was to use existing allocators which are
all order based.  Let me think about how to do allocation for an arbitrary
number of pages.
- For less than MAX_ORDER size we rely on the buddy allocator, so we are
  pretty much stuck with order sized allocation.  However, allocations of
  this size are not really interesting as you can call existing routines
  directly.
- For sizes greater than MAX_ORDER, we know that the allocation size will
  be at least pageblock sized.  So, the isolate/migrate scheme can still
  be used for full pageblocks.  We can then use direct migration for the
  remaining pages.  This does complicate things a bit.
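
To put rough numbers on that last case (illustrative only, assuming 4KB
pages and 512-page/2MB pageblocks): a 1408 page (5632KB) request would
cover two full pageblocks, which could go through the existing
isolate/migrate path, plus 384 trailing pages that would need to be
migrated separately.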

I'm guessing that most (?all?) allocations will be order based.  The use
cases I am aware of (hugetlbfs, Intel Cache Pseudo-Locking, RDMA) are all
order based.  However, as commented on the previous version, taking an
arbitrary nr_pages makes the interface more future-proof.
Reinette Chatre May 22, 2018, 4:41 p.m. UTC | #3
On 5/21/2018 4:48 PM, Mike Kravetz wrote:
> On 05/21/2018 01:54 AM, Vlastimil Babka wrote:
>> On 05/04/2018 01:29 AM, Mike Kravetz wrote:
>>> +/**
>>> + * find_alloc_contig_pages() -- attempt to find and allocate a contiguous
>>> + *				range of pages
>>> + * @nr_pages:	number of pages to find/allocate
>>> + * @gfp:	gfp mask used to limit search as well as during compaction
>>> + * @nid:	target node
>>> + * @nodemask:	mask of other possible nodes
>>> + *
>>> + * Pages can be freed with a call to free_contig_pages(), or by manually
>>> + * calling __free_page() for each page allocated.
>>> + *
>>> + * Return: pointer to 'order' pages on success, or NULL if not successful.
>>> + */
>>> +struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
>>> +					int nid, nodemask_t *nodemask)
>>> +{
>>> +	unsigned long i, alloc_order, order_pages;
>>> +	struct page *pages;
>>> +
>>> +	/*
>>> +	 * Underlying allocators perform page order sized allocations.
>>> +	 */
>>> +	alloc_order = get_count_order(nr_pages);
>>
>> So it takes arbitrary nr_pages but converts it to an order anyway? I think
>> that's rather suboptimal and wasteful... e.g. a range could be skipped
>> because some of the pages added by rounding cannot be migrated away.
> 
> Yes.  My idea with this series was to use existing allocators which are
> all order based.  Let me think about how to do allocation for an arbitrary
> number of pages.
> - For less than MAX_ORDER size we rely on the buddy allocator, so we are
>   pretty much stuck with order sized allocation.  However, allocations of
>   this size are not really interesting as you can call existing routines
>   directly.
> - For sizes greater than MAX_ORDER, we know that the allocation size will
>   be at least pageblock sized.  So, the isolate/migrate scheme can still
>   be used for full pageblocks.  We can then use direct migration for the
>   remaining pages.  This does complicate things a bit.
> 
> I'm guessing that most (?all?) allocations will be order based.  The use
> cases I am aware of (hugetlbfs, Intel Cache Pseudo-Locking, RDMA) are all
> order based.  However, as commented on the previous version, taking an
> arbitrary nr_pages makes the interface more future-proof.
> 

I noticed this Cache Pseudo-Locking statement and would like to clarify.
I have not been following this thread in detail so I would like to
apologize first if my comments are out of context.

Currently the Cache Pseudo-Locking allocations are order based because I
assumed it was required by the allocator. The contiguous regions needed
by Cache Pseudo-Locking will not always be order based - instead it is
based on the granularity of the cache allocation. One example is a
platform with 55MB L3 cache that can be divided into 20 equal portions.
To support Cache Pseudo-Locking on this platform we need to be able to
allocate contiguous regions at increments of 2816KB (the size of each
portion). In support of this example platform regions needed would thus
be 2816KB, 5632KB, 8448KB, etc.
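
(For the arithmetic: 55MB = 56320KB, and 56320KB / 20 = 2816KB per
portion, which is 704 pages of 4KB. 704 is not a power of two, so an
order based allocation would round a single portion up to 1024 pages,
i.e. 4MB, wasting 1280KB per portion.)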

Regards,

Reinette
Mike Kravetz May 22, 2018, 8:35 p.m. UTC | #4
On 05/22/2018 09:41 AM, Reinette Chatre wrote:
> On 5/21/2018 4:48 PM, Mike Kravetz wrote:
>> On 05/21/2018 01:54 AM, Vlastimil Babka wrote:
>>> On 05/04/2018 01:29 AM, Mike Kravetz wrote:
>>>> +/**
>>>> + * find_alloc_contig_pages() -- attempt to find and allocate a contiguous
>>>> + *				range of pages
>>>> + * @nr_pages:	number of pages to find/allocate
>>>> + * @gfp:	gfp mask used to limit search as well as during compaction
>>>> + * @nid:	target node
>>>> + * @nodemask:	mask of other possible nodes
>>>> + *
>>>> + * Pages can be freed with a call to free_contig_pages(), or by manually
>>>> + * calling __free_page() for each page allocated.
>>>> + *
>>>> + * Return: pointer to 'order' pages on success, or NULL if not successful.
>>>> + */
>>>> +struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
>>>> +					int nid, nodemask_t *nodemask)
>>>> +{
>>>> +	unsigned long i, alloc_order, order_pages;
>>>> +	struct page *pages;
>>>> +
>>>> +	/*
>>>> +	 * Underlying allocators perform page order sized allocations.
>>>> +	 */
>>>> +	alloc_order = get_count_order(nr_pages);
>>>
>>> So it takes arbitrary nr_pages but converts it to an order anyway? I think
>>> that's rather suboptimal and wasteful... e.g. a range could be skipped
>>> because some of the pages added by rounding cannot be migrated away.
>>
>> Yes.  My idea with this series was to use existing allocators which are
>> all order based.  Let me think about how to do allocation for an arbitrary
>> number of pages.
>> - For less than MAX_ORDER size we rely on the buddy allocator, so we are
>>   pretty much stuck with order sized allocation.  However, allocations of
>>   this size are not really interesting as you can call existing routines
>>   directly.
>> - For sizes greater than MAX_ORDER, we know that the allocation size will
>>   be at least pageblock sized.  So, the isolate/migrate scheme can still
>>   be used for full pageblocks.  We can then use direct migration for the
>>   remaining pages.  This does complicate things a bit.
>>
>> I'm guessing that most (?all?) allocations will be order based.  The use
>> cases I am aware of (hugetlbfs, Intel Cache Pseudo-Locking, RDMA) are all
>> order based.  However, as commented on the previous version, taking an
>> arbitrary nr_pages makes the interface more future-proof.
>>
> 
> I noticed this Cache Pseudo-Locking statement and would like to clarify.
> I have not been following this thread in detail so I would like to
> apologize first if my comments are out of context.
> 
> Currently the Cache Pseudo-Locking allocations are order based because I
> assumed it was required by the allocator. The contiguous regions needed
> by Cache Pseudo-Locking will not always be order based - instead it is
> based on the granularity of the cache allocation. One example is a
> platform with 55MB L3 cache that can be divided into 20 equal portions.
> To support Cache Pseudo-Locking on this platform we need to be able to
> allocate contiguous regions at increments of 2816KB (the size of each
> portion). In support of this example platform regions needed would thus
> be 2816KB, 5632KB, 8448KB, etc.

Thank you Reinette.  I was not aware of these details.  Yours is the most
concrete new use case.

This certainly makes more of a case for arbitrary sized allocations.
Vlastimil Babka May 23, 2018, 11:18 a.m. UTC | #5
On 05/22/2018 06:41 PM, Reinette Chatre wrote:
> On 5/21/2018 4:48 PM, Mike Kravetz wrote:
>> I'm guessing that most (?all?) allocations will be order based.  The use
>> cases I am aware of (hugetlbfs, Intel Cache Pseudo-Locking, RDMA) are all
>> order based.  However, as commented on the previous version, taking an
>> arbitrary nr_pages makes the interface more future-proof.
>>
> 
> I noticed this Cache Pseudo-Locking statement and would like to clarify.
> I have not been following this thread in detail so I would like to
> apologize first if my comments are out of context.
> 
> Currently the Cache Pseudo-Locking allocations are order based because I
> assumed it was required by the allocator. The contiguous regions needed
> by Cache Pseudo-Locking will not always be order based - instead it is
> based on the granularity of the cache allocation. One example is a
> platform with 55MB L3 cache that can be divided into 20 equal portions.
> To support Cache Pseudo-Locking on this platform we need to be able to
> allocate contiguous regions at increments of 2816KB (the size of each
> portion). In support of this example platform regions needed would thus
> be 2816KB, 5632KB, 8448KB, etc.

Will there be any alignment requirements for these allocations e.g. for
minimizing conflict misses?

Vlastimil
Reinette Chatre May 23, 2018, 6:07 p.m. UTC | #6
Hi Vlastimil,

On 5/23/2018 4:18 AM, Vlastimil Babka wrote:
> On 05/22/2018 06:41 PM, Reinette Chatre wrote:
>> On 5/21/2018 4:48 PM, Mike Kravetz wrote:
>>> I'm guessing that most (?all?) allocations will be order based.  The use
>>> cases I am aware of (hugetlbfs, Intel Cache Pseudo-Locking, RDMA) are all
>>> order based.  However, as commented on the previous version, taking an
>>> arbitrary nr_pages makes the interface more future-proof.
>>>
>>
>> I noticed this Cache Pseudo-Locking statement and would like to clarify.
>> I have not been following this thread in detail so I would like to
>> apologize first if my comments are out of context.
>>
>> Currently the Cache Pseudo-Locking allocations are order based because I
>> assumed it was required by the allocator. The contiguous regions needed
>> by Cache Pseudo-Locking will not always be order based - instead it is
>> based on the granularity of the cache allocation. One example is a
>> platform with 55MB L3 cache that can be divided into 20 equal portions.
>> To support Cache Pseudo-Locking on this platform we need to be able to
>> allocate contiguous regions at increments of 2816KB (the size of each
>> portion). In support of this example platform regions needed would thus
>> be 2816KB, 5632KB, 8448KB, etc.
> 
> Will there be any alignment requirements for these allocations e.g. for
> minimizing conflict misses?

There are two views on the usage of the allocated memory: On the user
space side, the kernel memory is mapped to userspace (using
remap_pfn_range()) and thus needs to be page aligned. On the kernel side
the memory is loaded into the cache, and it is here that the requirement
for it to be contiguous originates. The memory being contiguous reduces
the likelihood of physical addresses from the allocated memory mapping
to the same cache line and thus causing cache evictions of memory we are
trying to load into the cache.
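
Roughly, for a physically indexed set-associative cache: a contiguous
region of N bytes maps at most ceil(N / way_size) lines to any single
cache set, where way_size = cache_size / number_of_ways. As long as N
does not exceed the size of the cache portion set aside for
pseudo-locking, the region can be resident without evicting itself,
whereas the same amount of memory spread over scattered pages can map
many more lines than that to some of the sets.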

I hope I answered your question, if not, please let me know which parts
I missed and I will try again.

Reinette
Vlastimil Babka May 28, 2018, 1:12 p.m. UTC | #7
On 05/23/2018 08:07 PM, Reinette Chatre wrote:
> On 5/23/2018 4:18 AM, Vlastimil Babka wrote:
>> On 05/22/2018 06:41 PM, Reinette Chatre wrote:
>>> Currently the Cache Pseudo-Locking allocations are order based because I
>>> assumed it was required by the allocator. The contiguous regions needed
>>> by Cache Pseudo-Locking will not always be order based - instead it is
>>> based on the granularity of the cache allocation. One example is a
>>> platform with 55MB L3 cache that can be divided into 20 equal portions.
>>> To support Cache Pseudo-Locking on this platform we need to be able to
>>> allocate contiguous regions at increments of 2816KB (the size of each
>>> portion). In support of this example platform regions needed would thus
>>> be 2816KB, 5632KB, 8448KB, etc.
>>
>> Will there be any alignment requirements for these allocations e.g. for
>> minimizing conflict misses?
> 
> There are two views on the usage of the allocated memory: On the user
> space side, the kernel memory is mapped to userspace (using
> remap_pfn_range()) and thus needs to be page aligned. On the kernel side
> the memory is loaded into the cache, and it is here that the requirement
> for it to be contiguous originates. The memory being contiguous reduces
> the likelihood of physical addresses from the allocated memory mapping
> to the same cache line and thus causing cache evictions of memory we are
> trying to load into the cache.

Hi, yeah that's what I've been thinking, and I guess page alignment is
enough for that after all. I'm just not used to cache sizes and ways
that are not a power of two :)

> I hope I answered your question, if not, please let me know which parts
> I missed and I will try again.

Thanks!

Vlastimil

> Reinette
>

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 86a0d06463ab..b0d11777d487 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -573,6 +573,18 @@  static inline bool pm_suspended_storage(void)
 extern int alloc_contig_range(unsigned long start, unsigned long end,
 			      unsigned migratetype, gfp_t gfp_mask);
 extern void free_contig_range(unsigned long pfn, unsigned long nr_pages);
+extern struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
+						int nid, nodemask_t *nodemask);
+extern void free_contig_pages(struct page *page, unsigned long nr_pages);
+#else
+static inline struct page *find_alloc_contig_pages(unsigned long nr_pages,
+				gfp_t gfp, int nid, nodemask_t *nodemask)
+{
+	return NULL;
+}
+static inline void free_contig_pages(struct page *page, unsigned long nr_pages)
+{
+}
 #endif
 
 #ifdef CONFIG_CMA
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb1a5e0be6ee..d0a2d0da9eae 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -67,6 +67,7 @@ 
 #include <linux/ftrace.h>
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
+#include <linux/mmzone.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -7913,8 +7914,12 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 
 	/* Make sure the range is really isolated. */
 	if (test_pages_isolated(outer_start, end, false)) {
-		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
-			__func__, outer_start, end);
+#ifdef MIGRATE_CMA
+		/* Only print messages for CMA allocations */
+		if (migratetype == MIGRATE_CMA)
+			pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
+				__func__, outer_start, end);
+#endif
 		ret = -EBUSY;
 		goto done;
 	}
@@ -7950,6 +7955,133 @@  void free_contig_range(unsigned long pfn, unsigned long nr_pages)
 	}
 	WARN(count != 0, "%ld pages are still in use!\n", count);
 }
+
+/*
+ * Only check for obvious pfn/pages which can not be used/migrated.  The
+ * migration code will do the final check.  Under stress, this minimal set
+ * has been observed to provide the best results.  The checks can be expanded
+ * if needed.
+ */
+static bool contig_pfn_range_valid(struct zone *z, unsigned long start_pfn,
+					unsigned long nr_pages)
+{
+	unsigned long i, end_pfn = start_pfn + nr_pages;
+	struct page *page;
+
+	for (i = start_pfn; i < end_pfn; i++) {
+		if (!pfn_valid(i))
+			return false;
+
+		page = pfn_to_online_page(i);
+
+		if (page_zone(page) != z)
+			return false;
+
+	}
+
+	return true;
+}
+
+/*
+ * Search for and attempt to allocate contiguous allocations greater than
+ * MAX_ORDER.
+ */
+static struct page *__alloc_contig_pages_nodemask(gfp_t gfp,
+						unsigned long order,
+						int nid, nodemask_t *nodemask)
+{
+	unsigned long nr_pages, pfn, flags;
+	struct page *ret_page = NULL;
+	struct zonelist *zonelist;
+	struct zoneref *z;
+	struct zone *zone;
+	int rc;
+
+	nr_pages = 1 << order;
+	zonelist = node_zonelist(nid, gfp);
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp),
+					nodemask) {
+		pgdat_resize_lock(zone->zone_pgdat, &flags);
+		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+		while (zone_spans_pfn(zone, pfn + nr_pages - 1)) {
+			if (contig_pfn_range_valid(zone, pfn, nr_pages)) {
+				struct page *page = pfn_to_online_page(pfn);
+				unsigned int migratetype;
+
+				/*
+				 * All pageblocks in range must be of same
+				 * migrate type.
+				 */
+				migratetype = get_pageblock_migratetype(page);
+				pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+				rc = alloc_contig_range(pfn, pfn + nr_pages,
+						migratetype, gfp);
+				if (!rc) {
+					ret_page = pfn_to_page(pfn);
+					return ret_page;
+				}
+				pgdat_resize_lock(zone->zone_pgdat, &flags);
+			}
+			pfn += nr_pages;
+		}
+		pgdat_resize_unlock(zone->zone_pgdat, &flags);
+	}
+
+	return ret_page;
+}
+
+/**
+ * find_alloc_contig_pages() -- attempt to find and allocate a contiguous
+ *				range of pages
+ * @nr_pages:	number of pages to find/allocate
+ * @gfp:	gfp mask used to limit search as well as during compaction
+ * @nid:	target node
+ * @nodemask:	mask of other possible nodes
+ *
+ * Pages can be freed with a call to free_contig_pages(), or by manually
+ * calling __free_page() for each page allocated.
+ *
+ * Return: pointer to 'order' pages on success, or NULL if not successful.
+ */
+struct page *find_alloc_contig_pages(unsigned long nr_pages, gfp_t gfp,
+					int nid, nodemask_t *nodemask)
+{
+	unsigned long i, alloc_order, order_pages;
+	struct page *pages;
+
+	/*
+	 * Underlying allocators perform page order sized allocations.
+	 */
+	alloc_order = get_count_order(nr_pages);
+	if (alloc_order < MAX_ORDER) {
+		pages = __alloc_pages_nodemask(gfp, (unsigned int)alloc_order,
+						nid, nodemask);
+		split_page(pages, alloc_order);
+	} else {
+		pages = __alloc_contig_pages_nodemask(gfp, alloc_order, nid,
+							nodemask);
+	}
+
+	if (pages) {
+		/*
+		 * More pages than desired could have been allocated due to
+		 * rounding up to next page order.  Free any excess pages.
+		 */
+		order_pages = 1UL << alloc_order;
+		for (i = nr_pages; i < order_pages; i++)
+			__free_page(pages + i);
+	}
+
+	return pages;
+}
+EXPORT_SYMBOL_GPL(find_alloc_contig_pages);
+
+void free_contig_pages(struct page *page, unsigned long nr_pages)
+{
+	free_contig_range(page_to_pfn(page), nr_pages);
+}
+EXPORT_SYMBOL_GPL(free_contig_pages);
 #endif
 
 #if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA