
mm/page_alloc: Make alloc_gigantic_page() available for general use

Message ID 1571211293-29974-1-git-send-email-anshuman.khandual@arm.com (mailing list archive)
State New, archived
Series mm/page_alloc: Make alloc_gigantic_page() available for general use

Commit Message

Anshuman Khandual Oct. 16, 2019, 7:34 a.m. UTC
HugeTLB helper alloc_gigantic_page() implements a fairly generic allocation
method: it scans over various zones looking for a large contiguous pfn
range before trying to allocate it with alloc_contig_range(). Other than
deriving the requested order from 'struct hstate', there is nothing HugeTLB
specific in there. This can be made available for general use to allocate
contiguous memory which could not have been allocated through the buddy
allocator.

alloc_gigantic_page() has been split, carving out the actual allocation
method, which is then made available via a new alloc_contig_pages() helper
wrapped under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been
replaced with the more generic term 'contig'. Pages allocated here should be
freed with free_contig_range() or by calling __free_page() on each allocated
page.
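
For context, a minimal usage sketch of the new interface (illustrative only:
the 512-page request, the numa_mem_id() node and the caller are made up and
not part of this patch):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

static int example_contig_user(void)
{
	unsigned long nr_pages = 512;	/* arbitrary example size */
	struct page *page;

	/*
	 * No __GFP_MOVABLE: the range will hold unmovable data, so the
	 * movable zone must not be used to satisfy it.
	 */
	page = alloc_contig_pages(nr_pages, GFP_KERNEL, numa_mem_id(), NULL);
	if (!page)
		return -ENOMEM;

	/* ... use the physically contiguous pages ... */

	free_contig_range(page_to_pfn(page), nr_pages);
	return 0;
}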

Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
This is based on https://patchwork.kernel.org/patch/11190213/

Changes from [V5,1/2] mm/hugetlb: Make alloc_gigantic_page()...

- alloc_contig_page() takes nr_pages instead of order per Michal
- s/gigantic/contig on all related functions

 include/linux/gfp.h |  2 +
 mm/hugetlb.c        | 77 +----------------------------------
 mm/page_alloc.c     | 97 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+), 75 deletions(-)

Comments

David Hildenbrand Oct. 16, 2019, 8:08 a.m. UTC | #1
On 16.10.19 09:34, Anshuman Khandual wrote:
> HugeTLB helper alloc_gigantic_page() implements fairly generic allocation
> method where it scans over various zones looking for a large contiguous pfn
> range before trying to allocate it with alloc_contig_range(). Other than
> deriving the requested order from 'struct hstate', there is nothing HugeTLB
> specific in there. This can be made available for general use to allocate
> contiguous memory which could not have been allocated through the buddy
> allocator.
> 
> alloc_gigantic_page() has been split carving out actual allocation method
> which is then made available via new alloc_contig_pages() helper wrapped
> under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been replaced
> with more generic term 'contig'. Allocated pages here should be freed with
> free_contig_range() or by calling __free_page() on each allocated page.
> 
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Mike Rapoport <rppt@linux.ibm.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
> This is based on https://patchwork.kernel.org/patch/11190213/
> 
> Changes from [V5,1/2] mm/hugetlb: Make alloc_gigantic_page()...
> 
> - alloc_contig_page() takes nr_pages instead of order per Michal
> - s/gigantic/contig on all related functions
> 
>   include/linux/gfp.h |  2 +
>   mm/hugetlb.c        | 77 +----------------------------------
>   mm/page_alloc.c     | 97 +++++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 101 insertions(+), 75 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index fb07b503dc45..1a11d4857027 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -589,6 +589,8 @@ static inline bool pm_suspended_storage(void)
>   /* The below functions must be run on a range from a single zone. */
>   extern int alloc_contig_range(unsigned long start, unsigned long end,
>   			      unsigned migratetype, gfp_t gfp_mask);
> +extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
> +				       int nid, nodemask_t *nodemask);
>   #endif
>   void free_contig_range(unsigned long pfn, unsigned int nr_pages);
>   
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 985ee15eb04b..a5c2c880af27 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1023,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
>   }
>   
>   #ifdef CONFIG_CONTIG_ALLOC
> -static int __alloc_gigantic_page(unsigned long start_pfn,
> -				unsigned long nr_pages, gfp_t gfp_mask)
> -{
> -	unsigned long end_pfn = start_pfn + nr_pages;
> -	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
> -				  gfp_mask);
> -}
> -
> -static bool pfn_range_valid_gigantic(struct zone *z,
> -			unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	unsigned long i, end_pfn = start_pfn + nr_pages;
> -	struct page *page;
> -
> -	for (i = start_pfn; i < end_pfn; i++) {
> -		page = pfn_to_online_page(i);
> -		if (!page)
> -			return false;
> -
> -		if (page_zone(page) != z)
> -			return false;
> -
> -		if (PageReserved(page))
> -			return false;
> -
> -		if (page_count(page) > 0)
> -			return false;
> -
> -		if (PageHuge(page))
> -			return false;
> -	}
> -
> -	return true;
> -}
> -
> -static bool zone_spans_last_pfn(const struct zone *zone,
> -			unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	unsigned long last_pfn = start_pfn + nr_pages - 1;
> -	return zone_spans_pfn(zone, last_pfn);
> -}
> -
>   static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
>   		int nid, nodemask_t *nodemask)
>   {
> -	unsigned int order = huge_page_order(h);
> -	unsigned long nr_pages = 1 << order;
> -	unsigned long ret, pfn, flags;
> -	struct zonelist *zonelist;
> -	struct zone *zone;
> -	struct zoneref *z;
> -
> -	zonelist = node_zonelist(nid, gfp_mask);
> -	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
> -		spin_lock_irqsave(&zone->lock, flags);
> +	unsigned long nr_pages = 1UL << huge_page_order(h);
>   
> -		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> -		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
> -			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
> -				/*
> -				 * We release the zone lock here because
> -				 * alloc_contig_range() will also lock the zone
> -				 * at some point. If there's an allocation
> -				 * spinning on this lock, it may win the race
> -				 * and cause alloc_contig_range() to fail...
> -				 */
> -				spin_unlock_irqrestore(&zone->lock, flags);
> -				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
> -				if (!ret)
> -					return pfn_to_page(pfn);
> -				spin_lock_irqsave(&zone->lock, flags);
> -			}
> -			pfn += nr_pages;
> -		}
> -
> -		spin_unlock_irqrestore(&zone->lock, flags);
> -	}
> -
> -	return NULL;
> +	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
>   }
>   
>   static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index cd1dd0712624..1e084d6acf2f 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -8499,6 +8499,103 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>   				pfn_max_align_up(end), migratetype);
>   	return ret;
>   }
> +
> +static int __alloc_contig_pages(unsigned long start_pfn,
> +				unsigned long nr_pages, gfp_t gfp_mask)
> +{
> +	unsigned long end_pfn = start_pfn + nr_pages;
> +
> +	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
> +				  gfp_mask);
> +}
> +
> +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
> +				   unsigned long nr_pages)
> +{
> +	unsigned long i, end_pfn = start_pfn + nr_pages;
> +	struct page *page;
> +
> +	for (i = start_pfn; i < end_pfn; i++) {
> +		page = pfn_to_online_page(i);
> +		if (!page)
> +			return false;
> +
> +		if (page_zone(page) != z)
> +			return false;
> +
> +		if (PageReserved(page))
> +			return false;
> +
> +		if (page_count(page) > 0)
> +			return false;
> +
> +		if (PageHuge(page))
> +			return false;
> +	}

We might still try to allocate a lot of ranges that contain unmovable 
data (we could avoid isolating a lot of page blocks in the first place). 
I'd love to see something like pfn_range_movable() (similar, but 
different to is_mem_section_removable(), which uses has_unmovable_pages()).

Especially, I want something like that for virtio-mem, which also uses 
alloc_contig_range/free_contig_range, to perform a fast test if a range 
makes sense to try with alloc_contig_range().

https://lkml.org/lkml/2019/9/19/463
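
For what it's worth, a very rough sketch of the kind of fast pre-check meant
here (pfn_range_movable() is hypothetical, it is not an existing kernel
function, and this lockless heuristic is far coarser than
has_unmovable_pages()):

#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/page-flags.h>

/* heuristic only: free, LRU and non-LRU movable pages can be migrated */
static bool pfn_range_movable(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		struct page *page = pfn_to_online_page(pfn);

		if (!page)
			return false;
		if (!page_count(page) || PageLRU(page) || __PageMovable(page))
			continue;
		return false;
	}
	return true;
}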


> +	return true;
> +}
> +
> +static bool zone_spans_last_pfn(const struct zone *zone,
> +				unsigned long start_pfn, unsigned long nr_pages)
> +{
> +	unsigned long last_pfn = start_pfn + nr_pages - 1;
> +
> +	return zone_spans_pfn(zone, last_pfn);
> +}
> +
> +/**
> + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
> + * @nr_pages:	Number of contiguous pages to allocate
> + * @gfp_mask:	GFP mask to limit search and used during compaction
> + * @nid:	Target node
> + * @nodemask:	Mask for other possible nodes
> + *
> + * This routine is an wrapper around alloc_contig_range(). It scans over zones

"a" wrapper

> + * on an applicable zonelist to find a contiguous pfn range which can then be
> + * tried for allocation with alloc_contig_range(). This routine is intended
> + * for allocation requests which can not be fulfilled with buddy allocator.

"the" buddy allocator.

> + *
> + * Allocated pages can be freed with free_contig_range() or by manually calling
> + * __free_page() on each allocated page.
> + *
> + * Return: pointer to 'order' pages on success, or NULL if not successful.

order?

> + */
> +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
> +				int nid, nodemask_t *nodemask)
> +{
> +	unsigned long ret, pfn, flags;
> +	struct zonelist *zonelist;
> +	struct zone *zone;
> +	struct zoneref *z;
> +
> +	zonelist = node_zonelist(nid, gfp_mask);
> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
> +					gfp_zone(gfp_mask), nodemask) {

One important part is to never use the MOVABLE zone here (otherwise 
unmovable data would end up on the movable zone). But I guess the caller 
is responsible for that (not pass GFP_MOVABLE) like gigantic pages do.

> +		spin_lock_irqsave(&zone->lock, flags);
> +
> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);

This alignment does not make too much sense when allowing passing in 
!power of two orders. Maybe the caller should specify the requested 
alignment instead? Or should we enforce this to be aligned to make our 
life easier for now?
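
One way the "enforce it for now" option could look (an illustrative sketch,
not part of the patch): reject requests the current ALIGN() cannot express
before any zone is scanned.

#include <linux/log2.h>

/* hypothetical guard, called at the top of alloc_contig_pages() */
static bool contig_alloc_size_supported(unsigned long nr_pages)
{
	/* ALIGN(zone->zone_start_pfn, nr_pages) assumes a power-of-two size */
	return is_power_of_2(nr_pages);
}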

> +		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
> +			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
> +				/*
> +				 * We release the zone lock here because
> +				 * alloc_contig_range() will also lock the zone
> +				 * at some point. If there's an allocation
> +				 * spinning on this lock, it may win the race
> +				 * and cause alloc_contig_range() to fail...
> +				 */
> +				spin_unlock_irqrestore(&zone->lock, flags);
> +				ret = __alloc_contig_pages(pfn, nr_pages,
> +							gfp_mask);
> +				if (!ret)
> +					return pfn_to_page(pfn);
> +				spin_lock_irqsave(&zone->lock, flags);
> +			}
> +			pfn += nr_pages;

Now, that's interesting. When you relax the alignment restriction, 
skipping over the whole range is no longer such a good idea (imagine 
only the first PFN not being movable).
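
An illustrative alternative (an assumption, not something the patch does):
advance at pageblock granularity instead of by the whole request, so a
single unmovable PFN does not discard an entire nr_pages-sized window.

		/* instead of pfn += nr_pages in the loop quoted above: */
		pfn = ALIGN(pfn + 1, pageblock_nr_pages);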

> +		}
> +		spin_unlock_irqrestore(&zone->lock, flags);
> +	}
> +	return NULL;
> +}
>   #endif /* CONFIG_CONTIG_ALLOC */
>   
>   void free_contig_range(unsigned long pfn, unsigned int nr_pages)
>
David Hildenbrand Oct. 16, 2019, 8:40 a.m. UTC | #2
On 16.10.19 10:08, David Hildenbrand wrote:
> On 16.10.19 09:34, Anshuman Khandual wrote:
>> HugeTLB helper alloc_gigantic_page() implements fairly generic allocation
>> method where it scans over various zones looking for a large contiguous pfn
>> range before trying to allocate it with alloc_contig_range(). Other than
>> deriving the requested order from 'struct hstate', there is nothing HugeTLB
>> specific in there. This can be made available for general use to allocate
>> contiguous memory which could not have been allocated through the buddy
>> allocator.
>>
>> alloc_gigantic_page() has been split carving out actual allocation method
>> which is then made available via new alloc_contig_pages() helper wrapped
>> under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been replaced
>> with more generic term 'contig'. Allocated pages here should be freed with
>> free_contig_range() or by calling __free_page() on each allocated page.
>>
>> Cc: Mike Kravetz <mike.kravetz@oracle.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: David Rientjes <rientjes@google.com>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Mel Gorman <mgorman@techsingularity.net>
>> Cc: Mike Rapoport <rppt@linux.ibm.com>
>> Cc: Dan Williams <dan.j.williams@intel.com>
>> Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: linux-kernel@vger.kernel.org
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>> This is based on https://patchwork.kernel.org/patch/11190213/
>>
>> Changes from [V5,1/2] mm/hugetlb: Make alloc_gigantic_page()...
>>
>> - alloc_contig_page() takes nr_pages instead of order per Michal
>> - s/gigantic/contig on all related functions
>>
>>    include/linux/gfp.h |  2 +
>>    mm/hugetlb.c        | 77 +----------------------------------
>>    mm/page_alloc.c     | 97 +++++++++++++++++++++++++++++++++++++++++++++
>>    3 files changed, 101 insertions(+), 75 deletions(-)
>>
>> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
>> index fb07b503dc45..1a11d4857027 100644
>> --- a/include/linux/gfp.h
>> +++ b/include/linux/gfp.h
>> @@ -589,6 +589,8 @@ static inline bool pm_suspended_storage(void)
>>    /* The below functions must be run on a range from a single zone. */
>>    extern int alloc_contig_range(unsigned long start, unsigned long end,
>>    			      unsigned migratetype, gfp_t gfp_mask);
>> +extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
>> +				       int nid, nodemask_t *nodemask);
>>    #endif
>>    void free_contig_range(unsigned long pfn, unsigned int nr_pages);
>>    
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 985ee15eb04b..a5c2c880af27 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1023,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
>>    }
>>    
>>    #ifdef CONFIG_CONTIG_ALLOC
>> -static int __alloc_gigantic_page(unsigned long start_pfn,
>> -				unsigned long nr_pages, gfp_t gfp_mask)
>> -{
>> -	unsigned long end_pfn = start_pfn + nr_pages;
>> -	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
>> -				  gfp_mask);
>> -}
>> -
>> -static bool pfn_range_valid_gigantic(struct zone *z,
>> -			unsigned long start_pfn, unsigned long nr_pages)
>> -{
>> -	unsigned long i, end_pfn = start_pfn + nr_pages;
>> -	struct page *page;
>> -
>> -	for (i = start_pfn; i < end_pfn; i++) {
>> -		page = pfn_to_online_page(i);
>> -		if (!page)
>> -			return false;
>> -
>> -		if (page_zone(page) != z)
>> -			return false;
>> -
>> -		if (PageReserved(page))
>> -			return false;
>> -
>> -		if (page_count(page) > 0)
>> -			return false;
>> -
>> -		if (PageHuge(page))
>> -			return false;
>> -	}
>> -
>> -	return true;
>> -}
>> -
>> -static bool zone_spans_last_pfn(const struct zone *zone,
>> -			unsigned long start_pfn, unsigned long nr_pages)
>> -{
>> -	unsigned long last_pfn = start_pfn + nr_pages - 1;
>> -	return zone_spans_pfn(zone, last_pfn);
>> -}
>> -
>>    static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
>>    		int nid, nodemask_t *nodemask)
>>    {
>> -	unsigned int order = huge_page_order(h);
>> -	unsigned long nr_pages = 1 << order;
>> -	unsigned long ret, pfn, flags;
>> -	struct zonelist *zonelist;
>> -	struct zone *zone;
>> -	struct zoneref *z;
>> -
>> -	zonelist = node_zonelist(nid, gfp_mask);
>> -	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
>> -		spin_lock_irqsave(&zone->lock, flags);
>> +	unsigned long nr_pages = 1UL << huge_page_order(h);
>>    
>> -		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
>> -		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
>> -			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
>> -				/*
>> -				 * We release the zone lock here because
>> -				 * alloc_contig_range() will also lock the zone
>> -				 * at some point. If there's an allocation
>> -				 * spinning on this lock, it may win the race
>> -				 * and cause alloc_contig_range() to fail...
>> -				 */
>> -				spin_unlock_irqrestore(&zone->lock, flags);
>> -				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
>> -				if (!ret)
>> -					return pfn_to_page(pfn);
>> -				spin_lock_irqsave(&zone->lock, flags);
>> -			}
>> -			pfn += nr_pages;
>> -		}
>> -
>> -		spin_unlock_irqrestore(&zone->lock, flags);
>> -	}
>> -
>> -	return NULL;
>> +	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
>>    }
>>    
>>    static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index cd1dd0712624..1e084d6acf2f 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -8499,6 +8499,103 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>    				pfn_max_align_up(end), migratetype);
>>    	return ret;
>>    }
>> +
>> +static int __alloc_contig_pages(unsigned long start_pfn,
>> +				unsigned long nr_pages, gfp_t gfp_mask)
>> +{
>> +	unsigned long end_pfn = start_pfn + nr_pages;
>> +
>> +	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
>> +				  gfp_mask);
>> +}
>> +
>> +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
>> +				   unsigned long nr_pages)
>> +{
>> +	unsigned long i, end_pfn = start_pfn + nr_pages;
>> +	struct page *page;
>> +
>> +	for (i = start_pfn; i < end_pfn; i++) {
>> +		page = pfn_to_online_page(i);
>> +		if (!page)
>> +			return false;
>> +
>> +		if (page_zone(page) != z)
>> +			return false;
>> +
>> +		if (PageReserved(page))
>> +			return false;
>> +
>> +		if (page_count(page) > 0)
>> +			return false;
>> +
>> +		if (PageHuge(page))
>> +			return false;
>> +	}
> 
> We might still try to allocate a lot of ranges that contain unmovable
> data (we could avoid isolating a lot of page blocks in the first place).
> I'd love to see something like pfn_range_movable() (similar, but
> different to is_mem_section_removable(), which uses has_unmovable_pages()).
> 
> Especially, I want something like that for virtio-mem, which also uses
> alloc_contig_range/free_contig_range, to perform a fast test if a range
> makes sense to try with alloc_contig_range().
> 
> https://lkml.org/lkml/2019/9/19/463
> 
> 
>> +	return true;
>> +}
>> +
>> +static bool zone_spans_last_pfn(const struct zone *zone,
>> +				unsigned long start_pfn, unsigned long nr_pages)
>> +{
>> +	unsigned long last_pfn = start_pfn + nr_pages - 1;
>> +
>> +	return zone_spans_pfn(zone, last_pfn);
>> +}
>> +
>> +/**
>> + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
>> + * @nr_pages:	Number of contiguous pages to allocate
>> + * @gfp_mask:	GFP mask to limit search and used during compaction
>> + * @nid:	Target node
>> + * @nodemask:	Mask for other possible nodes
>> + *
>> + * This routine is an wrapper around alloc_contig_range(). It scans over zones
> 
> "a" wrapper
> 
>> + * on an applicable zonelist to find a contiguous pfn range which can then be
>> + * tried for allocation with alloc_contig_range(). This routine is intended
>> + * for allocation requests which can not be fulfilled with buddy allocator.
> 
> "the" buddy allocator.
> 
>> + *
>> + * Allocated pages can be freed with free_contig_range() or by manually calling
>> + * __free_page() on each allocated page.
>> + *
>> + * Return: pointer to 'order' pages on success, or NULL if not successful.
> 
> order?
> 
>> + */
>> +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
>> +				int nid, nodemask_t *nodemask)
>> +{
>> +	unsigned long ret, pfn, flags;
>> +	struct zonelist *zonelist;
>> +	struct zone *zone;
>> +	struct zoneref *z;
>> +
>> +	zonelist = node_zonelist(nid, gfp_mask);
>> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
>> +					gfp_zone(gfp_mask), nodemask) {
> 
> One important part is to never use the MOVABLE zone here (otherwise
> unmovable data would end up on the movable zone). But I guess the caller
> is responsible for that (not pass GFP_MOVABLE) like gigantic pages do.
> 
>> +		spin_lock_irqsave(&zone->lock, flags);
>> +
>> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> 
> This alignment does not make too much sense when allowing passing in
> !power of two orders. Maybe the caller should specify the requested
> alignment instead? Or should we enforce this to be aligned to make our
> life easier for now?
> 
>> +		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
>> +			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
>> +				/*
>> +				 * We release the zone lock here because
>> +				 * alloc_contig_range() will also lock the zone
>> +				 * at some point. If there's an allocation
>> +				 * spinning on this lock, it may win the race
>> +				 * and cause alloc_contig_range() to fail...
>> +				 */
>> +				spin_unlock_irqrestore(&zone->lock, flags);
>> +				ret = __alloc_contig_pages(pfn, nr_pages,
>> +							gfp_mask);
>> +				if (!ret)
>> +					return pfn_to_page(pfn);
>> +				spin_lock_irqsave(&zone->lock, flags);
>> +			}
>> +			pfn += nr_pages;
> 
> Now, that's interesting. When you relax the alignment restriction,
> skipping over the whole range is no longer such a good idea (imagine
> only the first PFN not being movable).
> 
>> +		}
>> +		spin_unlock_irqrestore(&zone->lock, flags);
>> +	}
>> +	return NULL;
>> +}
>>    #endif /* CONFIG_CONTIG_ALLOC */
>>    
>>    void free_contig_range(unsigned long pfn, unsigned int nr_pages)
>>
> 
> 

FWIW, the patch subject needs a rework, too.
Michal Hocko Oct. 16, 2019, 8:51 a.m. UTC | #3
On Wed 16-10-19 10:08:21, David Hildenbrand wrote:
> On 16.10.19 09:34, Anshuman Khandual wrote:
[...]
> > +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
> > +				   unsigned long nr_pages)
> > +{
> > +	unsigned long i, end_pfn = start_pfn + nr_pages;
> > +	struct page *page;
> > +
> > +	for (i = start_pfn; i < end_pfn; i++) {
> > +		page = pfn_to_online_page(i);
> > +		if (!page)
> > +			return false;
> > +
> > +		if (page_zone(page) != z)
> > +			return false;
> > +
> > +		if (PageReserved(page))
> > +			return false;
> > +
> > +		if (page_count(page) > 0)
> > +			return false;
> > +
> > +		if (PageHuge(page))
> > +			return false;
> > +	}
> 
> We might still try to allocate a lot of ranges that contain unmovable data
> (we could avoid isolating a lot of page blocks in the first place). I'd love
> to see something like pfn_range_movable() (similar, but different to
> is_mem_section_removable(), which uses has_unmovable_pages()).

Just to make sure I understand. Do you want has_unmovable_pages to be
called inside pfn_range_valid_contig?
[...]
> > +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
> > +				int nid, nodemask_t *nodemask)
> > +{
> > +	unsigned long ret, pfn, flags;
> > +	struct zonelist *zonelist;
> > +	struct zone *zone;
> > +	struct zoneref *z;
> > +
> > +	zonelist = node_zonelist(nid, gfp_mask);
> > +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
> > +					gfp_zone(gfp_mask), nodemask) {
> 
> One important part is to never use the MOVABLE zone here (otherwise
> unmovable data would end up on the movable zone). But I guess the caller is
> responsible for that (not pass GFP_MOVABLE) like gigantic pages do.

Well, if the caller uses GFP_MOVABLE then the movability should be
implemented in some form. If that is not the case then it is a bug on
the caller behalf.

> > +		spin_lock_irqsave(&zone->lock, flags);
> > +
> > +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> 
> This alignment does not make too much sense when allowing passing in !power
> of two orders. Maybe the caller should specify the requested alignment
> instead? Or should we enforce this to be aligned to make our life easier for
> now?

Are there any use cases that would require anything other than the page alignment?
David Hildenbrand Oct. 16, 2019, 8:56 a.m. UTC | #4
On 16.10.19 10:51, Michal Hocko wrote:
> On Wed 16-10-19 10:08:21, David Hildenbrand wrote:
>> On 16.10.19 09:34, Anshuman Khandual wrote:
> [...]
>>> +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
>>> +				   unsigned long nr_pages)
>>> +{
>>> +	unsigned long i, end_pfn = start_pfn + nr_pages;
>>> +	struct page *page;
>>> +
>>> +	for (i = start_pfn; i < end_pfn; i++) {
>>> +		page = pfn_to_online_page(i);
>>> +		if (!page)
>>> +			return false;
>>> +
>>> +		if (page_zone(page) != z)
>>> +			return false;
>>> +
>>> +		if (PageReserved(page))
>>> +			return false;
>>> +
>>> +		if (page_count(page) > 0)
>>> +			return false;
>>> +
>>> +		if (PageHuge(page))
>>> +			return false;
>>> +	}
>>
>> We might still try to allocate a lot of ranges that contain unmovable data
>> (we could avoid isolating a lot of page blocks in the first place). I'd love
>> to see something like pfn_range_movable() (similar, but different to
>> is_mem_section_removable(), which uses has_unmovable_pages()).
> 
> Just to make sure I understand. Do you want has_unmovable_pages to be
> called inside pfn_range_valid_contig?

I think this requires more thought, as has_unmovable_pages() works on 
pageblocks only AFAIK. If you try to allocate < MAX_ORDER - 1, you could 
get a lot of false positives.

E.g., if a free "MAX_ORDER - 1" page spans two pageblocks and you only 
test the second pageblock, you might detect "unmovable" if not taking 
proper care of the "bigger" free page. (alloc_contig_range() properly 
works around that issue)


> [...]
>>> +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
>>> +				int nid, nodemask_t *nodemask)
>>> +{
>>> +	unsigned long ret, pfn, flags;
>>> +	struct zonelist *zonelist;
>>> +	struct zone *zone;
>>> +	struct zoneref *z;
>>> +
>>> +	zonelist = node_zonelist(nid, gfp_mask);
>>> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>> +					gfp_zone(gfp_mask), nodemask) {
>>
>> One important part is to never use the MOVABLE zone here (otherwise
>> unmovable data would end up on the movable zone). But I guess the caller is
>> responsible for that (not pass GFP_MOVABLE) like gigantic pages do.
> 
> Well, if the caller uses GFP_MOVABLE then the movability should be
> implemented in some form. If that is not the case then it is a bug on
> the caller behalf.
> 
>>> +		spin_lock_irqsave(&zone->lock, flags);
>>> +
>>> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
>>
>> This alignment does not make too much sense when allowing passing in !power
>> of two orders. Maybe the caller should specify the requested alignment
>> instead? Or should we enforce this to be aligned to make our life easier for
>> now?
> 
> Are there any usecases that would require than the page alignment?

Gigantic pages have to be aligned AFAIK.
Michal Hocko Oct. 16, 2019, 8:58 a.m. UTC | #5
On Wed 16-10-19 13:04:53, Anshuman Khandual wrote:
> HugeTLB helper alloc_gigantic_page() implements fairly generic allocation
> method where it scans over various zones looking for a large contiguous pfn
> range before trying to allocate it with alloc_contig_range(). Other than
> deriving the requested order from 'struct hstate', there is nothing HugeTLB
> specific in there. This can be made available for general use to allocate
> contiguous memory which could not have been allocated through the buddy
> allocator.
> 
> alloc_gigantic_page() has been split carving out actual allocation method
> which is then made available via new alloc_contig_pages() helper wrapped
> under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been replaced
> with more generic term 'contig'. Allocated pages here should be freed with
> free_contig_range() or by calling __free_page() on each allocated page.

Do we want to export this to modules? Apart from the mostly stylistic issues
pointed out by David, this looks fine.

I do agree that a general allocator api belongs to page_alloc.c rather
than force people to invent their own and broken instances.
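
If the answer were to export it, the conventional form would be (a sketch
only; the patch as posted does not add this):

/* right after the definition of alloc_contig_pages() in mm/page_alloc.c */
EXPORT_SYMBOL_GPL(alloc_contig_pages);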
 
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Mike Rapoport <rppt@linux.ibm.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>

After other issues mentioned by David get resolved you can add
Acked-by: Michal Hocko <mhocko@suse.com>

> ---
> This is based on https://patchwork.kernel.org/patch/11190213/
> 
> Changes from [V5,1/2] mm/hugetlb: Make alloc_gigantic_page()...
> 
> - alloc_contig_page() takes nr_pages instead of order per Michal
> - s/gigantic/contig on all related functions
> 
>  include/linux/gfp.h |  2 +
>  mm/hugetlb.c        | 77 +----------------------------------
>  mm/page_alloc.c     | 97 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 101 insertions(+), 75 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index fb07b503dc45..1a11d4857027 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -589,6 +589,8 @@ static inline bool pm_suspended_storage(void)
>  /* The below functions must be run on a range from a single zone. */
>  extern int alloc_contig_range(unsigned long start, unsigned long end,
>  			      unsigned migratetype, gfp_t gfp_mask);
> +extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
> +				       int nid, nodemask_t *nodemask);
>  #endif
>  void free_contig_range(unsigned long pfn, unsigned int nr_pages);
>  
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 985ee15eb04b..a5c2c880af27 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1023,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
>  }
>  
>  #ifdef CONFIG_CONTIG_ALLOC
> -static int __alloc_gigantic_page(unsigned long start_pfn,
> -				unsigned long nr_pages, gfp_t gfp_mask)
> -{
> -	unsigned long end_pfn = start_pfn + nr_pages;
> -	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
> -				  gfp_mask);
> -}
> -
> -static bool pfn_range_valid_gigantic(struct zone *z,
> -			unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	unsigned long i, end_pfn = start_pfn + nr_pages;
> -	struct page *page;
> -
> -	for (i = start_pfn; i < end_pfn; i++) {
> -		page = pfn_to_online_page(i);
> -		if (!page)
> -			return false;
> -
> -		if (page_zone(page) != z)
> -			return false;
> -
> -		if (PageReserved(page))
> -			return false;
> -
> -		if (page_count(page) > 0)
> -			return false;
> -
> -		if (PageHuge(page))
> -			return false;
> -	}
> -
> -	return true;
> -}
> -
> -static bool zone_spans_last_pfn(const struct zone *zone,
> -			unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	unsigned long last_pfn = start_pfn + nr_pages - 1;
> -	return zone_spans_pfn(zone, last_pfn);
> -}
> -
>  static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
>  		int nid, nodemask_t *nodemask)
>  {
> -	unsigned int order = huge_page_order(h);
> -	unsigned long nr_pages = 1 << order;
> -	unsigned long ret, pfn, flags;
> -	struct zonelist *zonelist;
> -	struct zone *zone;
> -	struct zoneref *z;
> -
> -	zonelist = node_zonelist(nid, gfp_mask);
> -	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
> -		spin_lock_irqsave(&zone->lock, flags);
> +	unsigned long nr_pages = 1UL << huge_page_order(h);
>  
> -		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> -		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
> -			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
> -				/*
> -				 * We release the zone lock here because
> -				 * alloc_contig_range() will also lock the zone
> -				 * at some point. If there's an allocation
> -				 * spinning on this lock, it may win the race
> -				 * and cause alloc_contig_range() to fail...
> -				 */
> -				spin_unlock_irqrestore(&zone->lock, flags);
> -				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
> -				if (!ret)
> -					return pfn_to_page(pfn);
> -				spin_lock_irqsave(&zone->lock, flags);
> -			}
> -			pfn += nr_pages;
> -		}
> -
> -		spin_unlock_irqrestore(&zone->lock, flags);
> -	}
> -
> -	return NULL;
> +	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
>  }
>  
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index cd1dd0712624..1e084d6acf2f 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -8499,6 +8499,103 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  				pfn_max_align_up(end), migratetype);
>  	return ret;
>  }
> +
> +static int __alloc_contig_pages(unsigned long start_pfn,
> +				unsigned long nr_pages, gfp_t gfp_mask)
> +{
> +	unsigned long end_pfn = start_pfn + nr_pages;
> +
> +	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
> +				  gfp_mask);
> +}
> +
> +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
> +				   unsigned long nr_pages)
> +{
> +	unsigned long i, end_pfn = start_pfn + nr_pages;
> +	struct page *page;
> +
> +	for (i = start_pfn; i < end_pfn; i++) {
> +		page = pfn_to_online_page(i);
> +		if (!page)
> +			return false;
> +
> +		if (page_zone(page) != z)
> +			return false;
> +
> +		if (PageReserved(page))
> +			return false;
> +
> +		if (page_count(page) > 0)
> +			return false;
> +
> +		if (PageHuge(page))
> +			return false;
> +	}
> +	return true;
> +}
> +
> +static bool zone_spans_last_pfn(const struct zone *zone,
> +				unsigned long start_pfn, unsigned long nr_pages)
> +{
> +	unsigned long last_pfn = start_pfn + nr_pages - 1;
> +
> +	return zone_spans_pfn(zone, last_pfn);
> +}
> +
> +/**
> + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
> + * @nr_pages:	Number of contiguous pages to allocate
> + * @gfp_mask:	GFP mask to limit search and used during compaction
> + * @nid:	Target node
> + * @nodemask:	Mask for other possible nodes
> + *
> + * This routine is an wrapper around alloc_contig_range(). It scans over zones
> + * on an applicable zonelist to find a contiguous pfn range which can then be
> + * tried for allocation with alloc_contig_range(). This routine is intended
> + * for allocation requests which can not be fulfilled with buddy allocator.
> + *
> + * Allocated pages can be freed with free_contig_range() or by manually calling
> + * __free_page() on each allocated page.
> + *
> + * Return: pointer to 'order' pages on success, or NULL if not successful.
> + */
> +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
> +				int nid, nodemask_t *nodemask)
> +{
> +	unsigned long ret, pfn, flags;
> +	struct zonelist *zonelist;
> +	struct zone *zone;
> +	struct zoneref *z;
> +
> +	zonelist = node_zonelist(nid, gfp_mask);
> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
> +					gfp_zone(gfp_mask), nodemask) {
> +		spin_lock_irqsave(&zone->lock, flags);
> +
> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> +		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
> +			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
> +				/*
> +				 * We release the zone lock here because
> +				 * alloc_contig_range() will also lock the zone
> +				 * at some point. If there's an allocation
> +				 * spinning on this lock, it may win the race
> +				 * and cause alloc_contig_range() to fail...
> +				 */
> +				spin_unlock_irqrestore(&zone->lock, flags);
> +				ret = __alloc_contig_pages(pfn, nr_pages,
> +							gfp_mask);
> +				if (!ret)
> +					return pfn_to_page(pfn);
> +				spin_lock_irqsave(&zone->lock, flags);
> +			}
> +			pfn += nr_pages;
> +		}
> +		spin_unlock_irqrestore(&zone->lock, flags);
> +	}
> +	return NULL;
> +}
>  #endif /* CONFIG_CONTIG_ALLOC */
>  
>  void free_contig_range(unsigned long pfn, unsigned int nr_pages)
> -- 
> 2.20.1
Anshuman Khandual Oct. 16, 2019, 10:28 a.m. UTC | #6
On 10/16/2019 02:28 PM, Michal Hocko wrote:
> On Wed 16-10-19 13:04:53, Anshuman Khandual wrote:
>> HugeTLB helper alloc_gigantic_page() implements fairly generic allocation
>> method where it scans over various zones looking for a large contiguous pfn
>> range before trying to allocate it with alloc_contig_range(). Other than
>> deriving the requested order from 'struct hstate', there is nothing HugeTLB
>> specific in there. This can be made available for general use to allocate
>> contiguous memory which could not have been allocated through the buddy
>> allocator.
>>
>> alloc_gigantic_page() has been split carving out actual allocation method
>> which is then made available via new alloc_contig_pages() helper wrapped
>> under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been replaced
>> with more generic term 'contig'. Allocated pages here should be freed with
>> free_contig_range() or by calling __free_page() on each allocated page.
> 
> Do we want to export this to modules? Apart from mostly styling issues
> pointed by David this looks fine.
> 
> I do agree that a general allocator api belongs to page_alloc.c rather
> than force people to invent their own and broken instances.
>  
>> Cc: Mike Kravetz <mike.kravetz@oracle.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: David Rientjes <rientjes@google.com>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Mel Gorman <mgorman@techsingularity.net>
>> Cc: Mike Rapoport <rppt@linux.ibm.com>
>> Cc: Dan Williams <dan.j.williams@intel.com>
>> Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: linux-kernel@vger.kernel.org
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> 
> After other issues mentioned by David get resolved you can add

Just to confirm. Only the styling issues, right ? pfn_range_valid_contig(),
pfn alignment and zone scanning all remain the same like before ?

> Acked-by: Michal Hocko <mhocko@suse.com>
> 
>> ---
>> This is based on https://patchwork.kernel.org/patch/11190213/
>>
>> Changes from [V5,1/2] mm/hugetlb: Make alloc_gigantic_page()...
>>
>> - alloc_contig_page() takes nr_pages instead of order per Michal
>> - s/gigantic/contig on all related functions
>>
>>  include/linux/gfp.h |  2 +
>>  mm/hugetlb.c        | 77 +----------------------------------
>>  mm/page_alloc.c     | 97 +++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 101 insertions(+), 75 deletions(-)
>>
>> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
>> index fb07b503dc45..1a11d4857027 100644
>> --- a/include/linux/gfp.h
>> +++ b/include/linux/gfp.h
>> @@ -589,6 +589,8 @@ static inline bool pm_suspended_storage(void)
>>  /* The below functions must be run on a range from a single zone. */
>>  extern int alloc_contig_range(unsigned long start, unsigned long end,
>>  			      unsigned migratetype, gfp_t gfp_mask);
>> +extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
>> +				       int nid, nodemask_t *nodemask);
>>  #endif
>>  void free_contig_range(unsigned long pfn, unsigned int nr_pages);
>>  
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 985ee15eb04b..a5c2c880af27 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1023,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
>>  }
>>  
>>  #ifdef CONFIG_CONTIG_ALLOC
>> -static int __alloc_gigantic_page(unsigned long start_pfn,
>> -				unsigned long nr_pages, gfp_t gfp_mask)
>> -{
>> -	unsigned long end_pfn = start_pfn + nr_pages;
>> -	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
>> -				  gfp_mask);
>> -}
>> -
>> -static bool pfn_range_valid_gigantic(struct zone *z,
>> -			unsigned long start_pfn, unsigned long nr_pages)
>> -{
>> -	unsigned long i, end_pfn = start_pfn + nr_pages;
>> -	struct page *page;
>> -
>> -	for (i = start_pfn; i < end_pfn; i++) {
>> -		page = pfn_to_online_page(i);
>> -		if (!page)
>> -			return false;
>> -
>> -		if (page_zone(page) != z)
>> -			return false;
>> -
>> -		if (PageReserved(page))
>> -			return false;
>> -
>> -		if (page_count(page) > 0)
>> -			return false;
>> -
>> -		if (PageHuge(page))
>> -			return false;
>> -	}
>> -
>> -	return true;
>> -}
>> -
>> -static bool zone_spans_last_pfn(const struct zone *zone,
>> -			unsigned long start_pfn, unsigned long nr_pages)
>> -{
>> -	unsigned long last_pfn = start_pfn + nr_pages - 1;
>> -	return zone_spans_pfn(zone, last_pfn);
>> -}
>> -
>>  static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
>>  		int nid, nodemask_t *nodemask)
>>  {
>> -	unsigned int order = huge_page_order(h);
>> -	unsigned long nr_pages = 1 << order;
>> -	unsigned long ret, pfn, flags;
>> -	struct zonelist *zonelist;
>> -	struct zone *zone;
>> -	struct zoneref *z;
>> -
>> -	zonelist = node_zonelist(nid, gfp_mask);
>> -	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
>> -		spin_lock_irqsave(&zone->lock, flags);
>> +	unsigned long nr_pages = 1UL << huge_page_order(h);
>>  
>> -		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
>> -		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
>> -			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
>> -				/*
>> -				 * We release the zone lock here because
>> -				 * alloc_contig_range() will also lock the zone
>> -				 * at some point. If there's an allocation
>> -				 * spinning on this lock, it may win the race
>> -				 * and cause alloc_contig_range() to fail...
>> -				 */
>> -				spin_unlock_irqrestore(&zone->lock, flags);
>> -				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
>> -				if (!ret)
>> -					return pfn_to_page(pfn);
>> -				spin_lock_irqsave(&zone->lock, flags);
>> -			}
>> -			pfn += nr_pages;
>> -		}
>> -
>> -		spin_unlock_irqrestore(&zone->lock, flags);
>> -	}
>> -
>> -	return NULL;
>> +	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
>>  }
>>  
>>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index cd1dd0712624..1e084d6acf2f 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -8499,6 +8499,103 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>  				pfn_max_align_up(end), migratetype);
>>  	return ret;
>>  }
>> +
>> +static int __alloc_contig_pages(unsigned long start_pfn,
>> +				unsigned long nr_pages, gfp_t gfp_mask)
>> +{
>> +	unsigned long end_pfn = start_pfn + nr_pages;
>> +
>> +	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
>> +				  gfp_mask);
>> +}
>> +
>> +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
>> +				   unsigned long nr_pages)
>> +{
>> +	unsigned long i, end_pfn = start_pfn + nr_pages;
>> +	struct page *page;
>> +
>> +	for (i = start_pfn; i < end_pfn; i++) {
>> +		page = pfn_to_online_page(i);
>> +		if (!page)
>> +			return false;
>> +
>> +		if (page_zone(page) != z)
>> +			return false;
>> +
>> +		if (PageReserved(page))
>> +			return false;
>> +
>> +		if (page_count(page) > 0)
>> +			return false;
>> +
>> +		if (PageHuge(page))
>> +			return false;
>> +	}
>> +	return true;
>> +}
>> +
>> +static bool zone_spans_last_pfn(const struct zone *zone,
>> +				unsigned long start_pfn, unsigned long nr_pages)
>> +{
>> +	unsigned long last_pfn = start_pfn + nr_pages - 1;
>> +
>> +	return zone_spans_pfn(zone, last_pfn);
>> +}
>> +
>> +/**
>> + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
>> + * @nr_pages:	Number of contiguous pages to allocate
>> + * @gfp_mask:	GFP mask to limit search and used during compaction
>> + * @nid:	Target node
>> + * @nodemask:	Mask for other possible nodes
>> + *
>> + * This routine is an wrapper around alloc_contig_range(). It scans over zones
>> + * on an applicable zonelist to find a contiguous pfn range which can then be
>> + * tried for allocation with alloc_contig_range(). This routine is intended
>> + * for allocation requests which can not be fulfilled with buddy allocator.
>> + *
>> + * Allocated pages can be freed with free_contig_range() or by manually calling
>> + * __free_page() on each allocated page.
>> + *
>> + * Return: pointer to 'order' pages on success, or NULL if not successful.
>> + */
>> +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
>> +				int nid, nodemask_t *nodemask)
>> +{
>> +	unsigned long ret, pfn, flags;
>> +	struct zonelist *zonelist;
>> +	struct zone *zone;
>> +	struct zoneref *z;
>> +
>> +	zonelist = node_zonelist(nid, gfp_mask);
>> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
>> +					gfp_zone(gfp_mask), nodemask) {
>> +		spin_lock_irqsave(&zone->lock, flags);
>> +
>> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
>> +		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
>> +			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
>> +				/*
>> +				 * We release the zone lock here because
>> +				 * alloc_contig_range() will also lock the zone
>> +				 * at some point. If there's an allocation
>> +				 * spinning on this lock, it may win the race
>> +				 * and cause alloc_contig_range() to fail...
>> +				 */
>> +				spin_unlock_irqrestore(&zone->lock, flags);
>> +				ret = __alloc_contig_pages(pfn, nr_pages,
>> +							gfp_mask);
>> +				if (!ret)
>> +					return pfn_to_page(pfn);
>> +				spin_lock_irqsave(&zone->lock, flags);
>> +			}
>> +			pfn += nr_pages;
>> +		}
>> +		spin_unlock_irqrestore(&zone->lock, flags);
>> +	}
>> +	return NULL;
>> +}
>>  #endif /* CONFIG_CONTIG_ALLOC */
>>  
>>  void free_contig_range(unsigned long pfn, unsigned int nr_pages)
>> -- 
>> 2.20.1
>
Michal Hocko Oct. 16, 2019, 10:33 a.m. UTC | #7
On Wed 16-10-19 15:58:32, Anshuman Khandual wrote:
> 
> 
> On 10/16/2019 02:28 PM, Michal Hocko wrote:
[...]
> > After other issues mentioned by David get resolved you can add
> 
> Just to confirm. Only the styling issues, right ? pfn_range_valid_contig(),
> pfn alignment and zone scanning all remain the same like before ?

Yes, if they need any special handling then let's do it in a separate
patch with a proper justification.
David Hildenbrand Oct. 16, 2019, 10:41 a.m. UTC | #8
On 16.10.19 12:28, Anshuman Khandual wrote:
> 
> 
> On 10/16/2019 02:28 PM, Michal Hocko wrote:
>> On Wed 16-10-19 13:04:53, Anshuman Khandual wrote:
>>> HugeTLB helper alloc_gigantic_page() implements fairly generic allocation
>>> method where it scans over various zones looking for a large contiguous pfn
>>> range before trying to allocate it with alloc_contig_range(). Other than
>>> deriving the requested order from 'struct hstate', there is nothing HugeTLB
>>> specific in there. This can be made available for general use to allocate
>>> contiguous memory which could not have been allocated through the buddy
>>> allocator.
>>>
>>> alloc_gigantic_page() has been split carving out actual allocation method
>>> which is then made available via new alloc_contig_pages() helper wrapped
>>> under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been replaced
>>> with more generic term 'contig'. Allocated pages here should be freed with
>>> free_contig_range() or by calling __free_page() on each allocated page.
>>
>> Do we want to export this to modules? Apart from mostly styling issues
>> pointed by David this looks fine.
>>
>> I do agree that a general allocator api belongs to page_alloc.c rather
>> than force people to invent their own and broken instances.
>>   
>>> Cc: Mike Kravetz <mike.kravetz@oracle.com>
>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>> Cc: Vlastimil Babka <vbabka@suse.cz>
>>> Cc: Michal Hocko <mhocko@suse.com>
>>> Cc: David Rientjes <rientjes@google.com>
>>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>>> Cc: Oscar Salvador <osalvador@suse.de>
>>> Cc: Mel Gorman <mgorman@techsingularity.net>
>>> Cc: Mike Rapoport <rppt@linux.ibm.com>
>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>> Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
>>> Cc: Matthew Wilcox <willy@infradead.org>
>>> Cc: David Hildenbrand <david@redhat.com>
>>> Cc: linux-kernel@vger.kernel.org
>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>
>> After other issues mentioned by David get resolved you can add
> 
> Just to confirm. Only the styling issues, right ? pfn_range_valid_contig(),
> pfn alignment and zone scanning all remain the same like before ?
> 

Fine with me. Please do think about the alignment thingy. (I suggest to 
just enforce an alignment to a power of two for now (and return NULL if 
not a power of two), we can always relax that when needed - and then 
think about a better search for !power of two)
Michal Hocko Oct. 16, 2019, 11:08 a.m. UTC | #9
On Wed 16-10-19 10:56:16, David Hildenbrand wrote:
> On 16.10.19 10:51, Michal Hocko wrote:
> > On Wed 16-10-19 10:08:21, David Hildenbrand wrote:
> > > On 16.10.19 09:34, Anshuman Khandual wrote:
> > [...]
> > > > +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
> > > > +				   unsigned long nr_pages)
> > > > +{
> > > > +	unsigned long i, end_pfn = start_pfn + nr_pages;
> > > > +	struct page *page;
> > > > +
> > > > +	for (i = start_pfn; i < end_pfn; i++) {
> > > > +		page = pfn_to_online_page(i);
> > > > +		if (!page)
> > > > +			return false;
> > > > +
> > > > +		if (page_zone(page) != z)
> > > > +			return false;
> > > > +
> > > > +		if (PageReserved(page))
> > > > +			return false;
> > > > +
> > > > +		if (page_count(page) > 0)
> > > > +			return false;
> > > > +
> > > > +		if (PageHuge(page))
> > > > +			return false;
> > > > +	}
> > > 
> > > We might still try to allocate a lot of ranges that contain unmovable data
> > > (we could avoid isolating a lot of page blocks in the first place). I'd love
> > > to see something like pfn_range_movable() (similar, but different to
> > > is_mem_section_removable(), which uses has_unmovable_pages()).
> > 
> > Just to make sure I understand. Do you want has_unmovable_pages to be
> > called inside pfn_range_valid_contig?
> 
> I think this requires more thought, as has_unmovable_pages() works on
> pageblocks only AFAIK. If you try to allocate < MAX_ORDER - 1, you could get
> a lot of false positives.
> 
> E.g., if a free "MAX_ORDER - 1" page spans two pageblocks and you only test
> the second pageblock, you might detect "unmovable" if not taking proper care
> of the "bigger" free page. (alloc_contig_range() properly works around that
> issue)

OK, I see your point. You are right that false positives are possible. I
would deal with those in a separate patch though.

> > [...]
> > > > +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
> > > > +				int nid, nodemask_t *nodemask)
> > > > +{
> > > > +	unsigned long ret, pfn, flags;
> > > > +	struct zonelist *zonelist;
> > > > +	struct zone *zone;
> > > > +	struct zoneref *z;
> > > > +
> > > > +	zonelist = node_zonelist(nid, gfp_mask);
> > > > +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
> > > > +					gfp_zone(gfp_mask), nodemask) {
> > > 
> > > One important part is to never use the MOVABLE zone here (otherwise
> > > unmovable data would end up on the movable zone). But I guess the caller is
> > > responsible for that (not pass GFP_MOVABLE) like gigantic pages do.
> > 
> > Well, if the caller uses GFP_MOVABLE then the movability should be
> > implemented in some form. If that is not the case then it is a bug on
> > the caller behalf.
> > 
> > > > +		spin_lock_irqsave(&zone->lock, flags);
> > > > +
> > > > +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> > > 
> > > This alignment does not make too much sense when allowing passing in !power
> > > of two orders. Maybe the caller should specify the requested alignment
> > > instead? Or should we enforce this to be aligned to make our life easier for
> > > now?
> > 
> > Are there any usecases that would require than the page alignment?
> 
> Gigantic pages have to be aligned AFAIK.

Aligned to what? I do not see any guarantee like that in the existing
code.
David Hildenbrand Oct. 16, 2019, 11:10 a.m. UTC | #10
On 16.10.19 13:08, Michal Hocko wrote:
> On Wed 16-10-19 10:56:16, David Hildenbrand wrote:
>> On 16.10.19 10:51, Michal Hocko wrote:
>>> On Wed 16-10-19 10:08:21, David Hildenbrand wrote:
>>>> On 16.10.19 09:34, Anshuman Khandual wrote:
>>> [...]
>>>>> +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
>>>>> +				   unsigned long nr_pages)
>>>>> +{
>>>>> +	unsigned long i, end_pfn = start_pfn + nr_pages;
>>>>> +	struct page *page;
>>>>> +
>>>>> +	for (i = start_pfn; i < end_pfn; i++) {
>>>>> +		page = pfn_to_online_page(i);
>>>>> +		if (!page)
>>>>> +			return false;
>>>>> +
>>>>> +		if (page_zone(page) != z)
>>>>> +			return false;
>>>>> +
>>>>> +		if (PageReserved(page))
>>>>> +			return false;
>>>>> +
>>>>> +		if (page_count(page) > 0)
>>>>> +			return false;
>>>>> +
>>>>> +		if (PageHuge(page))
>>>>> +			return false;
>>>>> +	}
>>>>
>>>> We might still try to allocate a lot of ranges that contain unmovable data
>>>> (we could avoid isolating a lot of page blocks in the first place). I'd love
>>>> to see something like pfn_range_movable() (similar, but different to
>>>> is_mem_section_removable(), which uses has_unmovable_pages()).
>>>
>>> Just to make sure I understand. Do you want has_unmovable_pages to be
>>> called inside pfn_range_valid_contig?
>>
>> I think this requires more thought, as has_unmovable_pages() works on
>> pageblocks only AFAIK. If you try to allocate < MAX_ORDER - 1, you could get
>> a lot of false positives.
>>
>> E.g., if a free "MAX_ORDER - 1" page spans two pageblocks and you only test
>> the second pageblock, you might detect "unmovable" if not taking proper care
>> of the "bigger" free page. (alloc_contig_range() properly works around that
>> issue)
> 
> OK, I see your point. You are right that false positives are possible. I
> would deal with those in a separate patch though.
> 
>>> [...]
>>>>> +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
>>>>> +				int nid, nodemask_t *nodemask)
>>>>> +{
>>>>> +	unsigned long ret, pfn, flags;
>>>>> +	struct zonelist *zonelist;
>>>>> +	struct zone *zone;
>>>>> +	struct zoneref *z;
>>>>> +
>>>>> +	zonelist = node_zonelist(nid, gfp_mask);
>>>>> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>>>> +					gfp_zone(gfp_mask), nodemask) {
>>>>
>>>> One important part is to never use the MOVABLE zone here (otherwise
>>>> unmovable data would end up on the movable zone). But I guess the caller is
>>>> responsible for that (not pass GFP_MOVABLE) like gigantic pages do.
>>>
>>> Well, if the caller uses GFP_MOVABLE then the movability should be
>>> implemented in some form. If that is not the case then it is a bug on
>>> the caller behalf.
>>>
>>>>> +		spin_lock_irqsave(&zone->lock, flags);
>>>>> +
>>>>> +		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
>>>>
>>>> This alignment does not make much sense once we allow !power-of-two orders
>>>> to be passed in. Maybe the caller should specify the requested alignment
>>>> instead? Or should we enforce this alignment to make our life easier
>>>> for now?
>>>
>>> Are there any use cases that would require more than the page alignment?
>>
>> Gigantic pages have to be aligned AFAIK.
> 
> Aligned to what? I do not see any guarantee like that in the existing
> code.
> 

pfn = ALIGN(zone->zone_start_pfn, nr_pages);
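
To make the false-positive scenario discussed above concrete: on a common x86-64 configuration (4 KiB base pages, MAX_ORDER of 11, pageblock_order of 9; assumed values for illustration, not taken from this patch), a free MAX_ORDER - 1 buddy page covers exactly two pageblocks, and only the first of them contains the buddy's head page. A small standalone sketch of that arithmetic:

#include <stdio.h>

/* Assumed x86-64 defaults; the real values come from the kernel config. */
#define PAGE_SHIFT      12      /* 4 KiB base pages   */
#define MAX_ORDER       11      /* buddy orders 0..10 */
#define PAGEBLOCK_ORDER  9      /* 2 MiB pageblocks   */

int main(void)
{
	unsigned long max_buddy_pages = 1UL << (MAX_ORDER - 1);  /* 1024 pages */
	unsigned long pageblock_pages = 1UL << PAGEBLOCK_ORDER;  /*  512 pages */

	printf("largest buddy page: %lu KiB\n",
	       (max_buddy_pages << PAGE_SHIFT) >> 10);            /* 4096 KiB */
	printf("pageblock:          %lu KiB\n",
	       (pageblock_pages << PAGE_SHIFT) >> 10);            /* 2048 KiB */
	printf("pageblocks covered by one free order-%d page: %lu\n",
	       MAX_ORDER - 1, max_buddy_pages / pageblock_pages); /* 2 */

	/*
	 * Only the first of those two pageblocks holds the buddy's head page
	 * (the one marked PageBuddy with the recorded order), so a check
	 * confined to the second pageblock can misreport the range as
	 * unmovable unless it walks back to the head page, the case that
	 * alloc_contig_range() works around but a bare pageblock scan would not.
	 */
	return 0;
}
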
Michal Hocko Oct. 16, 2019, 11:49 a.m. UTC | #11
On Wed 16-10-19 13:10:41, David Hildenbrand wrote:
> On 16.10.19 13:08, Michal Hocko wrote:
> > On Wed 16-10-19 10:56:16, David Hildenbrand wrote:
[...]
> > > Gigantic pages have to be aligned AFAIK.
> > 
> > Aligned to what? I do not see any guarantee like that in the existing
> > code.
> > 
> 
> pfn = ALIGN(zone->zone_start_pfn, nr_pages);

I am obviously blind! Sorry about the confusion.
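
On the !power-of-two question settled above: ALIGN() in the kernel rounds up with a bitmask, so the result is a correct round-up only when the alignment is a power of two, which holds for gigantic pages but would not for an arbitrary nr_pages. A minimal userspace sketch of the same arithmetic (ALIGN_UP and the sample values are illustrative, not kernel code):

#include <stdio.h>

/* Mirrors the mask-based rounding that ALIGN() performs in the kernel. */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	/* Power-of-two size, e.g. a 1 GiB gigantic page = 262144 4 KiB pages. */
	printf("%#lx\n", ALIGN_UP(0x12345UL, 262144UL));  /* 0x40000, a correct multiple */

	/* Non-power-of-two size: the mask no longer rounds to a multiple. */
	printf("%lu\n", ALIGN_UP(10UL, 6UL));             /* prints 10, not 12 */

	return 0;
}

This is the crux of whether callers should be required to pass power-of-two sizes or an explicit alignment, as raised earlier in the thread.
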

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index fb07b503dc45..1a11d4857027 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -589,6 +589,8 @@  static inline bool pm_suspended_storage(void)
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
 			      unsigned migratetype, gfp_t gfp_mask);
+extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+				       int nid, nodemask_t *nodemask);
 #endif
 void free_contig_range(unsigned long pfn, unsigned int nr_pages);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 985ee15eb04b..a5c2c880af27 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1023,85 +1023,12 @@  static void free_gigantic_page(struct page *page, unsigned int order)
 }
 
 #ifdef CONFIG_CONTIG_ALLOC
-static int __alloc_gigantic_page(unsigned long start_pfn,
-				unsigned long nr_pages, gfp_t gfp_mask)
-{
-	unsigned long end_pfn = start_pfn + nr_pages;
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				  gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long i, end_pfn = start_pfn + nr_pages;
-	struct page *page;
-
-	for (i = start_pfn; i < end_pfn; i++) {
-		page = pfn_to_online_page(i);
-		if (!page)
-			return false;
-
-		if (page_zone(page) != z)
-			return false;
-
-		if (PageReserved(page))
-			return false;
-
-		if (page_count(page) > 0)
-			return false;
-
-		if (PageHuge(page))
-			return false;
-	}
-
-	return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long last_pfn = start_pfn + nr_pages - 1;
-	return zone_spans_pfn(zone, last_pfn);
-}
-
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 		int nid, nodemask_t *nodemask)
 {
-	unsigned int order = huge_page_order(h);
-	unsigned long nr_pages = 1 << order;
-	unsigned long ret, pfn, flags;
-	struct zonelist *zonelist;
-	struct zone *zone;
-	struct zoneref *z;
-
-	zonelist = node_zonelist(nid, gfp_mask);
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
-		spin_lock_irqsave(&zone->lock, flags);
+	unsigned long nr_pages = 1UL << huge_page_order(h);
 
-		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
-		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
-			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
-				/*
-				 * We release the zone lock here because
-				 * alloc_contig_range() will also lock the zone
-				 * at some point. If there's an allocation
-				 * spinning on this lock, it may win the race
-				 * and cause alloc_contig_range() to fail...
-				 */
-				spin_unlock_irqrestore(&zone->lock, flags);
-				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
-				if (!ret)
-					return pfn_to_page(pfn);
-				spin_lock_irqsave(&zone->lock, flags);
-			}
-			pfn += nr_pages;
-		}
-
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
-
-	return NULL;
+	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cd1dd0712624..1e084d6acf2f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8499,6 +8499,103 @@  int alloc_contig_range(unsigned long start, unsigned long end,
 				pfn_max_align_up(end), migratetype);
 	return ret;
 }
+
+static int __alloc_contig_pages(unsigned long start_pfn,
+				unsigned long nr_pages, gfp_t gfp_mask)
+{
+	unsigned long end_pfn = start_pfn + nr_pages;
+
+	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+				  gfp_mask);
+}
+
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
+				   unsigned long nr_pages)
+{
+	unsigned long i, end_pfn = start_pfn + nr_pages;
+	struct page *page;
+
+	for (i = start_pfn; i < end_pfn; i++) {
+		page = pfn_to_online_page(i);
+		if (!page)
+			return false;
+
+		if (page_zone(page) != z)
+			return false;
+
+		if (PageReserved(page))
+			return false;
+
+		if (page_count(page) > 0)
+			return false;
+
+		if (PageHuge(page))
+			return false;
+	}
+	return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+				unsigned long start_pfn, unsigned long nr_pages)
+{
+	unsigned long last_pfn = start_pfn + nr_pages - 1;
+
+	return zone_spans_pfn(zone, last_pfn);
+}
+
+/**
+ * alloc_contig_pages() -- tries to find and allocate a contiguous range of pages
+ * @nr_pages:	Number of contiguous pages to allocate
+ * @gfp_mask:	GFP mask to limit search and used during compaction
+ * @nid:	Target node
+ * @nodemask:	Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
+ * on an applicable zonelist to find a contiguous pfn range which can then be
+ * tried for allocation with alloc_contig_range(). This routine is intended
+ * for allocation requests which cannot be fulfilled with the buddy allocator.
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: pointer to 'nr_pages' contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+				int nid, nodemask_t *nodemask)
+{
+	unsigned long ret, pfn, flags;
+	struct zonelist *zonelist;
+	struct zone *zone;
+	struct zoneref *z;
+
+	zonelist = node_zonelist(nid, gfp_mask);
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(gfp_mask), nodemask) {
+		spin_lock_irqsave(&zone->lock, flags);
+
+		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+				/*
+				 * We release the zone lock here because
+				 * alloc_contig_range() will also lock the zone
+				 * at some point. If there's an allocation
+				 * spinning on this lock, it may win the race
+				 * and cause alloc_contig_range() to fail...
+				 */
+				spin_unlock_irqrestore(&zone->lock, flags);
+				ret = __alloc_contig_pages(pfn, nr_pages,
+							gfp_mask);
+				if (!ret)
+					return pfn_to_page(pfn);
+				spin_lock_irqsave(&zone->lock, flags);
+			}
+			pfn += nr_pages;
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+	return NULL;
+}
 #endif /* CONFIG_CONTIG_ALLOC */
 
 void free_contig_range(unsigned long pfn, unsigned int nr_pages)
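
For completeness, a minimal and purely hypothetical caller of the new interface might look like the sketch below; example_contig_alloc() is invented for illustration, alloc_contig_pages() and free_contig_range() are used with the signatures added above, and CONFIG_CONTIG_ALLOC is assumed to be enabled:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Illustrative sketch: allocate nr_pages physically contiguous pages on
 * node nid. GFP_KERNEL (without __GFP_MOVABLE) is used so that unmovable
 * data does not end up in ZONE_MOVABLE, as discussed in the thread above.
 */
static int example_contig_alloc(int nid, unsigned long nr_pages)
{
	struct page *page;

	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
				  nid, NULL);
	if (!page)
		return -ENOMEM;

	/* ... use the contiguous range starting at page_to_pfn(page) ... */

	/* Freeing with free_contig_range(); per-page __free_page() also works. */
	free_contig_range(page_to_pfn(page), nr_pages);
	return 0;
}
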