
[mm,v3,4/6] mm: Move hot-plug specific memory init into separate functions and optimize

Message ID 20181015202716.2171.7284.stgit@localhost.localdomain (mailing list archive)
State New, archived
Series: Deferred page init improvements

Commit Message

Alexander Duyck Oct. 15, 2018, 8:27 p.m. UTC
This patch combines the bits of memmap_init_zone and memmap_init_zone_device
that are related to hotplug into a single function called
__memmap_init_hotplug.

I also took the opportunity to fold __init_single_page's functionality into
this function. In doing so I can get rid of some of the redundancy, such as
the separate handling of the LRU pointers versus the pgmap.
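
For reference, the overlap comes from struct page itself: the ZONE_DEVICE
fields share a union with the LRU list head, so a given page is initialized
with one or the other but never both. Abridged from include/linux/mm_types.h
as of this series' baseline (only the relevant members are shown):

struct page {
	unsigned long flags;
	union {
		struct {	/* Page cache and anonymous pages */
			struct list_head lru;
			struct address_space *mapping;
			pgoff_t index;
			unsigned long private;
		};
		struct {	/* ZONE_DEVICE pages */
			struct dev_pagemap *pgmap;
			unsigned long hmm_data;
			unsigned long _zd_pad_1;
		};
		/* ... other union members omitted ... */
	};
	/* ... */
};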

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---
 mm/page_alloc.c |  232 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 159 insertions(+), 73 deletions(-)

Comments

Michal Hocko Oct. 17, 2018, 9:18 a.m. UTC | #1
On Mon 15-10-18 13:27:16, Alexander Duyck wrote:
> This patch is going through and combining the bits in memmap_init_zone and
> memmap_init_zone_device that are related to hotplug into a single function
> called __memmap_init_hotplug.
> 
> I also took the opportunity to integrate __init_single_page's functionality
> into this function. In doing so I can get rid of some of the redundancy
> such as the LRU pointers versus the pgmap.

This patch depends on [1] which I've had some concerns about. It adds
more code on top. I am still not convinced this is the right direction.

[1] http://lkml.kernel.org/r/20180925202053.3576.66039.stgit@localhost.localdomain
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
>  mm/page_alloc.c |  232 ++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 159 insertions(+), 73 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 20e9eb35d75d..92375e7867ba 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1192,6 +1192,94 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
>  #endif
>  }
>  
> +static void __meminit __init_pageblock(unsigned long start_pfn,
> +				       unsigned long nr_pages,
> +				       unsigned long zone, int nid,
> +				       struct dev_pagemap *pgmap,
> +				       bool is_reserved)
> +{
> +	unsigned long nr_pgmask = pageblock_nr_pages - 1;
> +	struct page *start_page = pfn_to_page(start_pfn);
> +	unsigned long pfn = start_pfn + nr_pages - 1;
> +#ifdef WANT_PAGE_VIRTUAL
> +	bool is_highmem = is_highmem_idx(zone);
> +#endif
> +	struct page *page;
> +
> +	/*
> +	 * Enforce the following requirements:
> +	 * size > 0
> +	 * size < pageblock_nr_pages
> +	 * start_pfn -> pfn does not cross pageblock_nr_pages boundary
> +	 */
> +	VM_BUG_ON(((start_pfn ^ pfn) | (nr_pages - 1)) > nr_pgmask);
> +
> +	/*
> +	 * Work from highest page to lowest, this way we will still be
> +	 * warm in the cache when we call set_pageblock_migratetype
> +	 * below.
> +	 *
> +	 * The loop is based around the page pointer as the main index
> +	 * instead of the pfn because pfn is not used inside the loop if
> +	 * the section number is not in page flags and WANT_PAGE_VIRTUAL
> +	 * is not defined.
> +	 */
> +	for (page = start_page + nr_pages; page-- != start_page; pfn--) {
> +		mm_zero_struct_page(page);
> +
> +		/*
> +		 * We use the start_pfn instead of pfn in the set_page_links
> +		 * call because of the fact that the pfn number is used to
> +		 * get the section_nr and this function should not be
> +		 * spanning more than a single section.
> +		 */
> +		set_page_links(page, zone, nid, start_pfn);
> +		init_page_count(page);
> +		page_mapcount_reset(page);
> +		page_cpupid_reset_last(page);
> +
> +		/*
> +		 * We can use the non-atomic __set_bit operation for setting
> +		 * the flag as we are still initializing the pages.
> +		 */
> +		if (is_reserved)
> +			__SetPageReserved(page);
> +
> +		/*
> +		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
> +		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
> +		 * page is ever freed or placed on a driver-private list.
> +		 */
> +		page->pgmap = pgmap;
> +		if (!pgmap)
> +			INIT_LIST_HEAD(&page->lru);
> +
> +#ifdef WANT_PAGE_VIRTUAL
> +		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
> +		if (!is_highmem)
> +			set_page_address(page, __va(pfn << PAGE_SHIFT));
> +#endif
> +	}
> +
> +	/*
> +	 * Mark the block movable so that blocks are reserved for
> +	 * movable at startup. This will force kernel allocations
> +	 * to reserve their blocks rather than leaking throughout
> +	 * the address space during boot when many long-lived
> +	 * kernel allocations are made.
> +	 *
> +	 * bitmap is created for zone's valid pfn range. but memmap
> +	 * can be created for invalid pages (for alignment)
> +	 * check here not to call set_pageblock_migratetype() against
> +	 * pfn out of zone.
> +	 *
> +	 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
> +	 * because this is done early in sparse_add_one_section
> +	 */
> +	if (!(start_pfn & nr_pgmask))
> +		set_pageblock_migratetype(start_page, MIGRATE_MOVABLE);
> +}
> +
>  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
>  static void __meminit init_reserved_page(unsigned long pfn)
>  {
> @@ -5513,6 +5601,36 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
>  	return false;
>  }
>  
> +static void __meminit __memmap_init_hotplug(unsigned long size, int nid,
> +					    unsigned long zone,
> +					    unsigned long start_pfn,
> +					    struct dev_pagemap *pgmap)
> +{
> +	unsigned long pfn = start_pfn + size;
> +
> +	while (pfn != start_pfn) {
> +		unsigned long stride = pfn;
> +
> +		pfn = max(ALIGN_DOWN(pfn - 1, pageblock_nr_pages), start_pfn);
> +		stride -= pfn;
> +
> +		/*
> +		 * The last argument of __init_pageblock is a boolean
> +		 * value indicating if the page will be marked as reserved.
> +		 *
> +		 * Mark page reserved as it will need to wait for onlining
> +		 * phase for it to be fully associated with a zone.
> +		 *
> +		 * Under certain circumstances ZONE_DEVICE pages may not
> +		 * need to be marked as reserved, however there is still
> +		 * code that is depending on this being set for now.
> +		 */
> +		__init_pageblock(pfn, stride, zone, nid, pgmap, true);
> +
> +		cond_resched();
> +	}
> +}
> +
>  /*
>   * Initially all pages are reserved - free ones are freed
>   * up by memblock_free_all() once the early boot process is
> @@ -5523,51 +5641,61 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  		struct vmem_altmap *altmap)
>  {
>  	unsigned long pfn, end_pfn = start_pfn + size;
> -	struct page *page;
>  
>  	if (highest_memmap_pfn < end_pfn - 1)
>  		highest_memmap_pfn = end_pfn - 1;
>  
> +	if (context == MEMMAP_HOTPLUG) {
>  #ifdef CONFIG_ZONE_DEVICE
> -	/*
> -	 * Honor reservation requested by the driver for this ZONE_DEVICE
> -	 * memory. We limit the total number of pages to initialize to just
> -	 * those that might contain the memory mapping. We will defer the
> -	 * ZONE_DEVICE page initialization until after we have released
> -	 * the hotplug lock.
> -	 */
> -	if (zone == ZONE_DEVICE) {
> -		if (!altmap)
> -			return;
> +		/*
> +		 * Honor reservation requested by the driver for this
> +		 * ZONE_DEVICE memory. We limit the total number of pages to
> +		 * initialize to just those that might contain the memory
> +		 * mapping. We will defer the ZONE_DEVICE page initialization
> +		 * until after we have released the hotplug lock.
> +		 */
> +		if (zone == ZONE_DEVICE) {
> +			if (!altmap)
> +				return;
> +
> +			if (start_pfn == altmap->base_pfn)
> +				start_pfn += altmap->reserve;
> +			end_pfn = altmap->base_pfn +
> +				  vmem_altmap_offset(altmap);
> +		}
> +#endif
> +		/*
> +		 * For these ZONE_DEVICE pages we don't need to record the
> +		 * pgmap as they should represent only those pages used to
> +		 * store the memory map. The actual ZONE_DEVICE pages will
> +		 * be initialized later.
> +		 */
> +		__memmap_init_hotplug(end_pfn - start_pfn, nid, zone,
> +				      start_pfn, NULL);
>  
> -		if (start_pfn == altmap->base_pfn)
> -			start_pfn += altmap->reserve;
> -		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
> +		return;
>  	}
> -#endif
>  
>  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +		struct page *page;
> +
>  		/*
>  		 * There can be holes in boot-time mem_map[]s handed to this
>  		 * function.  They do not exist on hotplugged memory.
>  		 */
> -		if (context == MEMMAP_EARLY) {
> -			if (!early_pfn_valid(pfn)) {
> -				pfn = next_valid_pfn(pfn) - 1;
> -				continue;
> -			}
> -			if (!early_pfn_in_nid(pfn, nid))
> -				continue;
> -			if (overlap_memmap_init(zone, &pfn))
> -				continue;
> -			if (defer_init(nid, pfn, end_pfn))
> -				break;
> +		if (!early_pfn_valid(pfn)) {
> +			pfn = next_valid_pfn(pfn) - 1;
> +			continue;
>  		}
> +		if (!early_pfn_in_nid(pfn, nid))
> +			continue;
> +		if (overlap_memmap_init(zone, &pfn))
> +			continue;
> +		if (defer_init(nid, pfn, end_pfn))
> +			break;
>  
>  		page = pfn_to_page(pfn);
>  		__init_single_page(page, pfn, zone, nid);
> -		if (context == MEMMAP_HOTPLUG)
> -			__SetPageReserved(page);
>  
>  		/*
>  		 * Mark the block movable so that blocks are reserved for
> @@ -5594,14 +5722,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
>  				   unsigned long size,
>  				   struct dev_pagemap *pgmap)
>  {
> -	unsigned long pfn, end_pfn = start_pfn + size;
>  	struct pglist_data *pgdat = zone->zone_pgdat;
>  	unsigned long zone_idx = zone_idx(zone);
>  	unsigned long start = jiffies;
>  	int nid = pgdat->node_id;
>  
> -	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
> -		return;
> +	VM_BUG_ON(!is_dev_zone(zone));
>  
>  	/*
>  	 * The call to memmap_init_zone should have already taken care
> @@ -5610,53 +5736,13 @@ void __ref memmap_init_zone_device(struct zone *zone,
>  	 */
>  	if (pgmap->altmap_valid) {
>  		struct vmem_altmap *altmap = &pgmap->altmap;
> +		unsigned long end_pfn = start_pfn + size;
>  
>  		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
>  		size = end_pfn - start_pfn;
>  	}
>  
> -	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		__init_single_page(page, pfn, zone_idx, nid);
> -
> -		/*
> -		 * Mark page reserved as it will need to wait for onlining
> -		 * phase for it to be fully associated with a zone.
> -		 *
> -		 * We can use the non-atomic __set_bit operation for setting
> -		 * the flag as we are still initializing the pages.
> -		 */
> -		__SetPageReserved(page);
> -
> -		/*
> -		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
> -		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
> -		 * page is ever freed or placed on a driver-private list.
> -		 */
> -		page->pgmap = pgmap;
> -		page->hmm_data = 0;
> -
> -		/*
> -		 * Mark the block movable so that blocks are reserved for
> -		 * movable at startup. This will force kernel allocations
> -		 * to reserve their blocks rather than leaking throughout
> -		 * the address space during boot when many long-lived
> -		 * kernel allocations are made.
> -		 *
> -		 * bitmap is created for zone's valid pfn range. but memmap
> -		 * can be created for invalid pages (for alignment)
> -		 * check here not to call set_pageblock_migratetype() against
> -		 * pfn out of zone.
> -		 *
> -		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
> -		 * because this is done early in sparse_add_one_section
> -		 */
> -		if (!(pfn & (pageblock_nr_pages - 1))) {
> -			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
> -			cond_resched();
> -		}
> -	}
> +	__memmap_init_hotplug(size, nid, zone_idx, start_pfn, pgmap);
>  
>  	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
>  		size, jiffies_to_msecs(jiffies - start));
>
Alexander Duyck Oct. 17, 2018, 3:26 p.m. UTC | #2
On 10/17/2018 2:18 AM, Michal Hocko wrote:
> On Mon 15-10-18 13:27:16, Alexander Duyck wrote:
>> This patch is going through and combining the bits in memmap_init_zone and
>> memmap_init_zone_device that are related to hotplug into a single function
>> called __memmap_init_hotplug.
>>
>> I also took the opportunity to integrate __init_single_page's functionality
>> into this function. In doing so I can get rid of some of the redundancy
>> such as the LRU pointers versus the pgmap.
> 
> This patch depends on [1] which I've had some concerns about. It adds
> more code on top. I am still not convinced this is the right direction.
> 
> [1] http://lkml.kernel.org/r/20180925202053.3576.66039.stgit@localhost.localdomain

I wouldn't say this patch depends on it. The fact is I could easily make 
this patch work with the code as it was before that change. This was 
actually something I had started on first and then decided to put off until 
later, since I knew it was more invasive than the memmap_init_zone_device 
change was.

With that said, I am also wondering if a possible solution to the 
complaints you had would be to just export the __init_pageblock function 
later and move the call to memmap_init_zone_device out to the memremap or 
hotplug code once Dan gets the refactoring for HMM and memremap all sorted 
out.
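
Roughly what I have in mind, purely as an illustration (none of this exists
yet; the declaration location and the caller below are just placeholders):

/* mm/internal.h (hypothetical): expose the pageblock initializer to the
 * hotplug/memremap code instead of keeping it page_alloc-internal.
 */
void __init_pageblock(unsigned long start_pfn, unsigned long nr_pages,
		      unsigned long zone, int nid,
		      struct dev_pagemap *pgmap, bool is_reserved);

/* kernel/memremap.c (hypothetical caller): after the hotplug lock is
 * released, initialize the ZONE_DEVICE pages one pageblock at a time.
 */
static void memremap_init_pages(struct dev_pagemap *pgmap, int nid,
				unsigned long zone_idx,
				unsigned long pfn, unsigned long end_pfn)
{
	while (pfn < end_pfn) {
		unsigned long block_end = ALIGN(pfn + 1, pageblock_nr_pages);
		unsigned long nr = min(end_pfn, block_end) - pfn;

		__init_pageblock(pfn, nr, zone_idx, nid, pgmap, true);
		pfn += nr;
		cond_resched();
	}
}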

>>
>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>> ---
>>   mm/page_alloc.c |  232 ++++++++++++++++++++++++++++++++++++++-----------------
>>   1 file changed, 159 insertions(+), 73 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 20e9eb35d75d..92375e7867ba 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1192,6 +1192,94 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
>>   #endif
>>   }
>>   
>> +static void __meminit __init_pageblock(unsigned long start_pfn,
>> +				       unsigned long nr_pages,
>> +				       unsigned long zone, int nid,
>> +				       struct dev_pagemap *pgmap,
>> +				       bool is_reserved)
>> +{
>> +	unsigned long nr_pgmask = pageblock_nr_pages - 1;
>> +	struct page *start_page = pfn_to_page(start_pfn);
>> +	unsigned long pfn = start_pfn + nr_pages - 1;
>> +#ifdef WANT_PAGE_VIRTUAL
>> +	bool is_highmem = is_highmem_idx(zone);
>> +#endif
>> +	struct page *page;
>> +
>> +	/*
>> +	 * Enforce the following requirements:
>> +	 * size > 0
>> +	 * size < pageblock_nr_pages
>> +	 * start_pfn -> pfn does not cross pageblock_nr_pages boundary
>> +	 */
>> +	VM_BUG_ON(((start_pfn ^ pfn) | (nr_pages - 1)) > nr_pgmask);
>> +
>> +	/*
>> +	 * Work from highest page to lowest, this way we will still be
>> +	 * warm in the cache when we call set_pageblock_migratetype
>> +	 * below.
>> +	 *
>> +	 * The loop is based around the page pointer as the main index
>> +	 * instead of the pfn because pfn is not used inside the loop if
>> +	 * the section number is not in page flags and WANT_PAGE_VIRTUAL
>> +	 * is not defined.
>> +	 */
>> +	for (page = start_page + nr_pages; page-- != start_page; pfn--) {
>> +		mm_zero_struct_page(page);
>> +
>> +		/*
>> +		 * We use the start_pfn instead of pfn in the set_page_links
>> +		 * call because of the fact that the pfn number is used to
>> +		 * get the section_nr and this function should not be
>> +		 * spanning more than a single section.
>> +		 */
>> +		set_page_links(page, zone, nid, start_pfn);
>> +		init_page_count(page);
>> +		page_mapcount_reset(page);
>> +		page_cpupid_reset_last(page);
>> +
>> +		/*
>> +		 * We can use the non-atomic __set_bit operation for setting
>> +		 * the flag as we are still initializing the pages.
>> +		 */
>> +		if (is_reserved)
>> +			__SetPageReserved(page);
>> +
>> +		/*
>> +		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
>> +		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
>> +		 * page is ever freed or placed on a driver-private list.
>> +		 */
>> +		page->pgmap = pgmap;
>> +		if (!pgmap)
>> +			INIT_LIST_HEAD(&page->lru);
>> +
>> +#ifdef WANT_PAGE_VIRTUAL
>> +		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
>> +		if (!is_highmem)
>> +			set_page_address(page, __va(pfn << PAGE_SHIFT));
>> +#endif
>> +	}
>> +
>> +	/*
>> +	 * Mark the block movable so that blocks are reserved for
>> +	 * movable at startup. This will force kernel allocations
>> +	 * to reserve their blocks rather than leaking throughout
>> +	 * the address space during boot when many long-lived
>> +	 * kernel allocations are made.
>> +	 *
>> +	 * bitmap is created for zone's valid pfn range. but memmap
>> +	 * can be created for invalid pages (for alignment)
>> +	 * check here not to call set_pageblock_migratetype() against
>> +	 * pfn out of zone.
>> +	 *
>> +	 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
>> +	 * because this is done early in sparse_add_one_section
>> +	 */
>> +	if (!(start_pfn & nr_pgmask))
>> +		set_pageblock_migratetype(start_page, MIGRATE_MOVABLE);
>> +}
>> +
>>   #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
>>   static void __meminit init_reserved_page(unsigned long pfn)
>>   {
>> @@ -5513,6 +5601,36 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
>>   	return false;
>>   }
>>   
>> +static void __meminit __memmap_init_hotplug(unsigned long size, int nid,
>> +					    unsigned long zone,
>> +					    unsigned long start_pfn,
>> +					    struct dev_pagemap *pgmap)
>> +{
>> +	unsigned long pfn = start_pfn + size;
>> +
>> +	while (pfn != start_pfn) {
>> +		unsigned long stride = pfn;
>> +
>> +		pfn = max(ALIGN_DOWN(pfn - 1, pageblock_nr_pages), start_pfn);
>> +		stride -= pfn;
>> +
>> +		/*
>> +		 * The last argument of __init_pageblock is a boolean
>> +		 * value indicating if the page will be marked as reserved.
>> +		 *
>> +		 * Mark page reserved as it will need to wait for onlining
>> +		 * phase for it to be fully associated with a zone.
>> +		 *
>> +		 * Under certain circumstances ZONE_DEVICE pages may not
>> +		 * need to be marked as reserved, however there is still
>> +		 * code that is depending on this being set for now.
>> +		 */
>> +		__init_pageblock(pfn, stride, zone, nid, pgmap, true);
>> +
>> +		cond_resched();
>> +	}
>> +}
>> +
>>   /*
>>    * Initially all pages are reserved - free ones are freed
>>    * up by memblock_free_all() once the early boot process is
>> @@ -5523,51 +5641,61 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>>   		struct vmem_altmap *altmap)
>>   {
>>   	unsigned long pfn, end_pfn = start_pfn + size;
>> -	struct page *page;
>>   
>>   	if (highest_memmap_pfn < end_pfn - 1)
>>   		highest_memmap_pfn = end_pfn - 1;
>>   
>> +	if (context == MEMMAP_HOTPLUG) {
>>   #ifdef CONFIG_ZONE_DEVICE
>> -	/*
>> -	 * Honor reservation requested by the driver for this ZONE_DEVICE
>> -	 * memory. We limit the total number of pages to initialize to just
>> -	 * those that might contain the memory mapping. We will defer the
>> -	 * ZONE_DEVICE page initialization until after we have released
>> -	 * the hotplug lock.
>> -	 */
>> -	if (zone == ZONE_DEVICE) {
>> -		if (!altmap)
>> -			return;
>> +		/*
>> +		 * Honor reservation requested by the driver for this
>> +		 * ZONE_DEVICE memory. We limit the total number of pages to
>> +		 * initialize to just those that might contain the memory
>> +		 * mapping. We will defer the ZONE_DEVICE page initialization
>> +		 * until after we have released the hotplug lock.
>> +		 */
>> +		if (zone == ZONE_DEVICE) {
>> +			if (!altmap)
>> +				return;
>> +
>> +			if (start_pfn == altmap->base_pfn)
>> +				start_pfn += altmap->reserve;
>> +			end_pfn = altmap->base_pfn +
>> +				  vmem_altmap_offset(altmap);
>> +		}
>> +#endif
>> +		/*
>> +		 * For these ZONE_DEVICE pages we don't need to record the
>> +		 * pgmap as they should represent only those pages used to
>> +		 * store the memory map. The actual ZONE_DEVICE pages will
>> +		 * be initialized later.
>> +		 */
>> +		__memmap_init_hotplug(end_pfn - start_pfn, nid, zone,
>> +				      start_pfn, NULL);
>>   
>> -		if (start_pfn == altmap->base_pfn)
>> -			start_pfn += altmap->reserve;
>> -		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
>> +		return;
>>   	}
>> -#endif
>>   
>>   	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>> +		struct page *page;
>> +
>>   		/*
>>   		 * There can be holes in boot-time mem_map[]s handed to this
>>   		 * function.  They do not exist on hotplugged memory.
>>   		 */
>> -		if (context == MEMMAP_EARLY) {
>> -			if (!early_pfn_valid(pfn)) {
>> -				pfn = next_valid_pfn(pfn) - 1;
>> -				continue;
>> -			}
>> -			if (!early_pfn_in_nid(pfn, nid))
>> -				continue;
>> -			if (overlap_memmap_init(zone, &pfn))
>> -				continue;
>> -			if (defer_init(nid, pfn, end_pfn))
>> -				break;
>> +		if (!early_pfn_valid(pfn)) {
>> +			pfn = next_valid_pfn(pfn) - 1;
>> +			continue;
>>   		}
>> +		if (!early_pfn_in_nid(pfn, nid))
>> +			continue;
>> +		if (overlap_memmap_init(zone, &pfn))
>> +			continue;
>> +		if (defer_init(nid, pfn, end_pfn))
>> +			break;
>>   
>>   		page = pfn_to_page(pfn);
>>   		__init_single_page(page, pfn, zone, nid);
>> -		if (context == MEMMAP_HOTPLUG)
>> -			__SetPageReserved(page);
>>   
>>   		/*
>>   		 * Mark the block movable so that blocks are reserved for
>> @@ -5594,14 +5722,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>   				   unsigned long size,
>>   				   struct dev_pagemap *pgmap)
>>   {
>> -	unsigned long pfn, end_pfn = start_pfn + size;
>>   	struct pglist_data *pgdat = zone->zone_pgdat;
>>   	unsigned long zone_idx = zone_idx(zone);
>>   	unsigned long start = jiffies;
>>   	int nid = pgdat->node_id;
>>   
>> -	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
>> -		return;
>> +	VM_BUG_ON(!is_dev_zone(zone));
>>   
>>   	/*
>>   	 * The call to memmap_init_zone should have already taken care
>> @@ -5610,53 +5736,13 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>   	 */
>>   	if (pgmap->altmap_valid) {
>>   		struct vmem_altmap *altmap = &pgmap->altmap;
>> +		unsigned long end_pfn = start_pfn + size;
>>   
>>   		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
>>   		size = end_pfn - start_pfn;
>>   	}
>>   
>> -	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>> -		struct page *page = pfn_to_page(pfn);
>> -
>> -		__init_single_page(page, pfn, zone_idx, nid);
>> -
>> -		/*
>> -		 * Mark page reserved as it will need to wait for onlining
>> -		 * phase for it to be fully associated with a zone.
>> -		 *
>> -		 * We can use the non-atomic __set_bit operation for setting
>> -		 * the flag as we are still initializing the pages.
>> -		 */
>> -		__SetPageReserved(page);
>> -
>> -		/*
>> -		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
>> -		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
>> -		 * page is ever freed or placed on a driver-private list.
>> -		 */
>> -		page->pgmap = pgmap;
>> -		page->hmm_data = 0;
>> -
>> -		/*
>> -		 * Mark the block movable so that blocks are reserved for
>> -		 * movable at startup. This will force kernel allocations
>> -		 * to reserve their blocks rather than leaking throughout
>> -		 * the address space during boot when many long-lived
>> -		 * kernel allocations are made.
>> -		 *
>> -		 * bitmap is created for zone's valid pfn range. but memmap
>> -		 * can be created for invalid pages (for alignment)
>> -		 * check here not to call set_pageblock_migratetype() against
>> -		 * pfn out of zone.
>> -		 *
>> -		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
>> -		 * because this is done early in sparse_add_one_section
>> -		 */
>> -		if (!(pfn & (pageblock_nr_pages - 1))) {
>> -			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
>> -			cond_resched();
>> -		}
>> -	}
>> +	__memmap_init_hotplug(size, nid, zone_idx, start_pfn, pgmap);
>>   
>>   	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
>>   		size, jiffies_to_msecs(jiffies - start));
>>
>
Michal Hocko Oct. 24, 2018, 12:36 p.m. UTC | #3
On Wed 17-10-18 08:26:20, Alexander Duyck wrote:
[...]
> With that said I am also wondering if a possible solution to the complaints
> you had would be to look at just exporting the __init_pageblock function
> later and moving the call to memmap_init_zone_device out to the memremap or
> hotplug code when Dan gets the refactoring for HMM and memremap all sorted
> out.

Why can't the caller simply provide a constructor for each page if there are
special requirements? We currently have the altmap to do struct page
allocation, but nothing really prevents making it more generic and
controlling both allocation and initialization however it suits a specific
usecase. I really do not want to make special cases here and there.
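
Something along these lines, purely as a sketch (none of these names exist
today; the idea is just that the caller supplies the per-page hook):

typedef void (*memmap_page_ctor_t)(struct page *page, unsigned long pfn,
				   void *data);

/* Hypothetical generic loop: common init for every page, then whatever
 * the caller needs (set pgmap, Reserved, etc.) via its constructor.
 */
static void memmap_init_range(unsigned long start_pfn, unsigned long nr_pages,
			      unsigned long zone_idx, int nid,
			      memmap_page_ctor_t ctor, void *data)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__init_single_page(page, pfn, zone_idx, nid);
		if (ctor)
			ctor(page, pfn, data);
	}
}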
Alexander Duyck Oct. 24, 2018, 3:08 p.m. UTC | #4
On Wed, 2018-10-24 at 14:36 +0200, Michal Hocko wrote:
> On Wed 17-10-18 08:26:20, Alexander Duyck wrote:
> [...]
> > With that said I am also wondering if a possible solution to the
> > complaints
> > you had would be to look at just exporting the __init_pageblock
> > function
> > later and moving the call to memmap_init_zone_device out to the
> > memremap or
> > hotplug code when Dan gets the refactoring for HMM and memremap all
> > sorted
> > out.
> 
> Why cannot we simply provide a constructor for each page by the
> caller if there are special requirements? we currently have alt_map
> to do struct page allocation but nothing really prevents to make it
> more generic and control both allocation and initialization whatever
> suits a specific usecase. I really do not want make special cases
> here and there.

The advantage of the current __init_pageblock function is that we end up
constructing everything we are going to write outside of the main loop, and
the loop itself is then focused only on the init.

If we end up having to call a per-page constructor, that is going to slow
things down. For example, just the reserved bit setting was adding something
like 4 seconds to the init time for the system. By allowing it to be
configured as part of the flags outside of the loop I was able to avoid
taking that performance hit.
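
To make the difference concrete (illustrative pseudocode only, not the
actual patch):

	/* per-page constructor: an indirect call, and its decisions, are
	 * repeated for every one of potentially millions of pages
	 */
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		page = pfn_to_page(pfn);
		__init_single_page(page, pfn, zone_idx, nid);
		ctor(page, pfn, data);	/* decides Reserved, pgmap, ... */
	}

	/* __init_pageblock style: decide once, outside the loop, and the
	 * loop body only writes the precomputed values
	 */
	is_reserved = true;	/* computed from the context, once */
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		page = pfn_to_page(pfn);
		__init_single_page(page, pfn, zone_idx, nid);
		if (is_reserved)
			__SetPageReserved(page);
		page->pgmap = pgmap;
	}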

- Alex
Michal Hocko Oct. 24, 2018, 3:27 p.m. UTC | #5
On Wed 24-10-18 08:08:41, Alexander Duyck wrote:
> On Wed, 2018-10-24 at 14:36 +0200, Michal Hocko wrote:
> > On Wed 17-10-18 08:26:20, Alexander Duyck wrote:
> > [...]
> > > With that said I am also wondering if a possible solution to the
> > > complaints
> > > you had would be to look at just exporting the __init_pageblock
> > > function
> > > later and moving the call to memmap_init_zone_device out to the
> > > memremap or
> > > hotplug code when Dan gets the refactoring for HMM and memremap all
> > > sorted
> > > out.
> > 
> > Why cannot we simply provide a constructor for each page by the
> > caller if there are special requirements? we currently have alt_map
> > to do struct page allocation but nothing really prevents to make it
> > more generic and control both allocation and initialization whatever
> > suits a specific usecase. I really do not want make special cases
> > here and there.
> 
> The advantage to the current __init_pageblock function is that we end
> up constructing everything we are going to write outside of the main
> loop and then are focused only on init.

But we do really want move_pfn_range_to_zone to provide a usable pfn
range without any additional tweaks. If there are potential
optimizations to be done there then let's do it but please do not try to
micro optimize to the point that the interface doesn't make any sense
anymore.
Alexander Duyck Oct. 24, 2018, 5:35 p.m. UTC | #6
On Wed, 2018-10-24 at 17:27 +0200, Michal Hocko wrote:
> On Wed 24-10-18 08:08:41, Alexander Duyck wrote:
> > On Wed, 2018-10-24 at 14:36 +0200, Michal Hocko wrote:
> > > On Wed 17-10-18 08:26:20, Alexander Duyck wrote:
> > > [...]
> > > > With that said I am also wondering if a possible solution to
> > > > the complaints you had would be to look at just exporting the
> > > > __init_pageblock function later and moving the call to
> > > > memmap_init_zone_device out to the memremap or hotplug code
> > > > when Dan gets the refactoring for HMM and memremap all sorted
> > > > out.
> > > 
> > > Why cannot we simply provide a constructor for each page by the
> > > caller if there are special requirements? we currently have
> > > alt_map
> > > to do struct page allocation but nothing really prevents to make
> > > it
> > > more generic and control both allocation and initialization
> > > whatever
> > > suits a specific usecase. I really do not want make special cases
> > > here and there.
> > 
> > The advantage to the current __init_pageblock function is that we
> > end up constructing everything we are going to write outside of the
> > main loop and then are focused only on init.
> 
> But we do really want move_pfn_range_to_zone to provide a usable pfn
> range without any additional tweaks. If there are potential
> optimizations to be done there then let's do it but please do not try
> to micro optimize to the point that the interface doesn't make any
> sense anymore.

The actual difference between the two setups is not all that great. From
the sound of things, the ultimate difference between ZONE_DEVICE pages and
regular pages is the pgmap and whether we want the reserved bit set or not.

What I am providing with __init_pageblock at this point is a function that
is flexible enough for us to be able to do either one, and then we just
expose a different front end on it for the specific type of page we have to
initialize. It works for regular hotplug, ZONE_DEVICE, and deferred memory
initialization. The way I view it is that this function is a high-performance
multi-tasker, not something that is micro-optimized for any one specific
function.
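
In other words the callers would boil down to something like this
(illustrative only, these are not wrappers from the patch, and want_reserved
is a made-up variable):

	/* regular hotplug: no pgmap, pages stay Reserved until onlining */
	__init_pageblock(pfn, nr_pages, zone, nid, NULL, true);

	/* ZONE_DEVICE: record the pgmap; whether the Reserved bit is still
	 * needed here is the open question
	 */
	__init_pageblock(pfn, nr_pages, zone, nid, pgmap, want_reserved);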

Thanks.

- Alex
Michal Hocko Oct. 25, 2018, 12:41 p.m. UTC | #7
On Wed 24-10-18 10:35:09, Alexander Duyck wrote:
> On Wed, 2018-10-24 at 17:27 +0200, Michal Hocko wrote:
> > On Wed 24-10-18 08:08:41, Alexander Duyck wrote:
> > > On Wed, 2018-10-24 at 14:36 +0200, Michal Hocko wrote:
> > > > On Wed 17-10-18 08:26:20, Alexander Duyck wrote:
> > > > [...]
> > > > > With that said I am also wondering if a possible solution to
> > > > > the complaints you had would be to look at just exporting the
> > > > > __init_pageblock function later and moving the call to
> > > > > memmap_init_zone_device out to the memremap or hotplug code
> > > > > when Dan gets the refactoring for HMM and memremap all sorted
> > > > > out.
> > > > 
> > > > Why cannot we simply provide a constructor for each page by the
> > > > caller if there are special requirements? we currently have
> > > > alt_map
> > > > to do struct page allocation but nothing really prevents to make
> > > > it
> > > > more generic and control both allocation and initialization
> > > > whatever
> > > > suits a specific usecase. I really do not want make special cases
> > > > here and there.
> > > 
> > > The advantage to the current __init_pageblock function is that we
> > > end up constructing everything we are going to write outside of the
> > > main loop and then are focused only on init.
> > 
> > But we do really want move_pfn_range_to_zone to provide a usable pfn
> > range without any additional tweaks. If there are potential
> > optimizations to be done there then let's do it but please do not try
> > to micro optimize to the point that the interface doesn't make any
> > sense anymore.
> 
> The actual difference between the two setups is not all that great.
> From the sound of things the ultimate difference between the
> ZONE_DEVICE pages and regular pages is the pgmap and if we want the
> reserved bit set or not.

So once again, why do you need PageReserved in the first place? Also, it
seems that the pgmap can hold both the struct page allocator and the
constructor.
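
Something like the following, to illustrate (the page_init op is
hypothetical and does not exist; the altmap fields do):

struct dev_pagemap {
	struct vmem_altmap altmap;	/* existing: controls how the
					 * memmap itself is allocated */
	bool altmap_valid;
	/* ... */
	/* hypothetical: per-page constructor supplied by the owner */
	void (*page_init)(struct page *page, unsigned long pfn,
			  struct dev_pagemap *pgmap);
};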

> What I am providing with __init_pageblock at this point is a function
> that is flexible enough for us to be able to do either one and then
> just expose a different front end on it for the specific type of page
> we have to initialize. It works for regular hotplug, ZONE_DEVICE, and
> deferred memory initialization. The way I view it is that this funciton
> is a high performance multi-tasker, not something that is micro-
> optimized for any one specific function.

All that I am arguing is that all of this should live inside the core
hotplug code. If you can plumb it in there without the hacks I have seen
earlier then I do not mind, but let's try to keep the core infrastructure as
clean as possible.

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20e9eb35d75d..92375e7867ba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1192,6 +1192,94 @@  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 #endif
 }
 
+static void __meminit __init_pageblock(unsigned long start_pfn,
+				       unsigned long nr_pages,
+				       unsigned long zone, int nid,
+				       struct dev_pagemap *pgmap,
+				       bool is_reserved)
+{
+	unsigned long nr_pgmask = pageblock_nr_pages - 1;
+	struct page *start_page = pfn_to_page(start_pfn);
+	unsigned long pfn = start_pfn + nr_pages - 1;
+#ifdef WANT_PAGE_VIRTUAL
+	bool is_highmem = is_highmem_idx(zone);
+#endif
+	struct page *page;
+
+	/*
+	 * Enforce the following requirements:
+	 * size > 0
+	 * size < pageblock_nr_pages
+	 * start_pfn -> pfn does not cross pageblock_nr_pages boundary
+	 */
+	VM_BUG_ON(((start_pfn ^ pfn) | (nr_pages - 1)) > nr_pgmask);
+
+	/*
+	 * Work from highest page to lowest, this way we will still be
+	 * warm in the cache when we call set_pageblock_migratetype
+	 * below.
+	 *
+	 * The loop is based around the page pointer as the main index
+	 * instead of the pfn because pfn is not used inside the loop if
+	 * the section number is not in page flags and WANT_PAGE_VIRTUAL
+	 * is not defined.
+	 */
+	for (page = start_page + nr_pages; page-- != start_page; pfn--) {
+		mm_zero_struct_page(page);
+
+		/*
+		 * We use the start_pfn instead of pfn in the set_page_links
+		 * call because of the fact that the pfn number is used to
+		 * get the section_nr and this function should not be
+		 * spanning more than a single section.
+		 */
+		set_page_links(page, zone, nid, start_pfn);
+		init_page_count(page);
+		page_mapcount_reset(page);
+		page_cpupid_reset_last(page);
+
+		/*
+		 * We can use the non-atomic __set_bit operation for setting
+		 * the flag as we are still initializing the pages.
+		 */
+		if (is_reserved)
+			__SetPageReserved(page);
+
+		/*
+		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
+		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
+		 * page is ever freed or placed on a driver-private list.
+		 */
+		page->pgmap = pgmap;
+		if (!pgmap)
+			INIT_LIST_HEAD(&page->lru);
+
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem)
+			set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+	}
+
+	/*
+	 * Mark the block movable so that blocks are reserved for
+	 * movable at startup. This will force kernel allocations
+	 * to reserve their blocks rather than leaking throughout
+	 * the address space during boot when many long-lived
+	 * kernel allocations are made.
+	 *
+	 * bitmap is created for zone's valid pfn range. but memmap
+	 * can be created for invalid pages (for alignment)
+	 * check here not to call set_pageblock_migratetype() against
+	 * pfn out of zone.
+	 *
+	 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+	 * because this is done early in sparse_add_one_section
+	 */
+	if (!(start_pfn & nr_pgmask))
+		set_pageblock_migratetype(start_page, MIGRATE_MOVABLE);
+}
+
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static void __meminit init_reserved_page(unsigned long pfn)
 {
@@ -5513,6 +5601,36 @@  void __ref build_all_zonelists(pg_data_t *pgdat)
 	return false;
 }
 
+static void __meminit __memmap_init_hotplug(unsigned long size, int nid,
+					    unsigned long zone,
+					    unsigned long start_pfn,
+					    struct dev_pagemap *pgmap)
+{
+	unsigned long pfn = start_pfn + size;
+
+	while (pfn != start_pfn) {
+		unsigned long stride = pfn;
+
+		pfn = max(ALIGN_DOWN(pfn - 1, pageblock_nr_pages), start_pfn);
+		stride -= pfn;
+
+		/*
+		 * The last argument of __init_pageblock is a boolean
+		 * value indicating if the page will be marked as reserved.
+		 *
+		 * Mark page reserved as it will need to wait for onlining
+		 * phase for it to be fully associated with a zone.
+		 *
+		 * Under certain circumstances ZONE_DEVICE pages may not
+		 * need to be marked as reserved, however there is still
+		 * code that is depending on this being set for now.
+		 */
+		__init_pageblock(pfn, stride, zone, nid, pgmap, true);
+
+		cond_resched();
+	}
+}
+
 /*
  * Initially all pages are reserved - free ones are freed
  * up by memblock_free_all() once the early boot process is
@@ -5523,51 +5641,61 @@  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		struct vmem_altmap *altmap)
 {
 	unsigned long pfn, end_pfn = start_pfn + size;
-	struct page *page;
 
 	if (highest_memmap_pfn < end_pfn - 1)
 		highest_memmap_pfn = end_pfn - 1;
 
+	if (context == MEMMAP_HOTPLUG) {
 #ifdef CONFIG_ZONE_DEVICE
-	/*
-	 * Honor reservation requested by the driver for this ZONE_DEVICE
-	 * memory. We limit the total number of pages to initialize to just
-	 * those that might contain the memory mapping. We will defer the
-	 * ZONE_DEVICE page initialization until after we have released
-	 * the hotplug lock.
-	 */
-	if (zone == ZONE_DEVICE) {
-		if (!altmap)
-			return;
+		/*
+		 * Honor reservation requested by the driver for this
+		 * ZONE_DEVICE memory. We limit the total number of pages to
+		 * initialize to just those that might contain the memory
+		 * mapping. We will defer the ZONE_DEVICE page initialization
+		 * until after we have released the hotplug lock.
+		 */
+		if (zone == ZONE_DEVICE) {
+			if (!altmap)
+				return;
+
+			if (start_pfn == altmap->base_pfn)
+				start_pfn += altmap->reserve;
+			end_pfn = altmap->base_pfn +
+				  vmem_altmap_offset(altmap);
+		}
+#endif
+		/*
+		 * For these ZONE_DEVICE pages we don't need to record the
+		 * pgmap as they should represent only those pages used to
+		 * store the memory map. The actual ZONE_DEVICE pages will
+		 * be initialized later.
+		 */
+		__memmap_init_hotplug(end_pfn - start_pfn, nid, zone,
+				      start_pfn, NULL);
 
-		if (start_pfn == altmap->base_pfn)
-			start_pfn += altmap->reserve;
-		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+		return;
 	}
-#endif
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		struct page *page;
+
 		/*
 		 * There can be holes in boot-time mem_map[]s handed to this
 		 * function.  They do not exist on hotplugged memory.
 		 */
-		if (context == MEMMAP_EARLY) {
-			if (!early_pfn_valid(pfn)) {
-				pfn = next_valid_pfn(pfn) - 1;
-				continue;
-			}
-			if (!early_pfn_in_nid(pfn, nid))
-				continue;
-			if (overlap_memmap_init(zone, &pfn))
-				continue;
-			if (defer_init(nid, pfn, end_pfn))
-				break;
+		if (!early_pfn_valid(pfn)) {
+			pfn = next_valid_pfn(pfn) - 1;
+			continue;
 		}
+		if (!early_pfn_in_nid(pfn, nid))
+			continue;
+		if (overlap_memmap_init(zone, &pfn))
+			continue;
+		if (defer_init(nid, pfn, end_pfn))
+			break;
 
 		page = pfn_to_page(pfn);
 		__init_single_page(page, pfn, zone, nid);
-		if (context == MEMMAP_HOTPLUG)
-			__SetPageReserved(page);
 
 		/*
 		 * Mark the block movable so that blocks are reserved for
@@ -5594,14 +5722,12 @@  void __ref memmap_init_zone_device(struct zone *zone,
 				   unsigned long size,
 				   struct dev_pagemap *pgmap)
 {
-	unsigned long pfn, end_pfn = start_pfn + size;
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	unsigned long zone_idx = zone_idx(zone);
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
 
-	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
-		return;
+	VM_BUG_ON(!is_dev_zone(zone));
 
 	/*
 	 * The call to memmap_init_zone should have already taken care
@@ -5610,53 +5736,13 @@  void __ref memmap_init_zone_device(struct zone *zone,
 	 */
 	if (pgmap->altmap_valid) {
 		struct vmem_altmap *altmap = &pgmap->altmap;
+		unsigned long end_pfn = start_pfn + size;
 
 		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
 		size = end_pfn - start_pfn;
 	}
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		struct page *page = pfn_to_page(pfn);
-
-		__init_single_page(page, pfn, zone_idx, nid);
-
-		/*
-		 * Mark page reserved as it will need to wait for onlining
-		 * phase for it to be fully associated with a zone.
-		 *
-		 * We can use the non-atomic __set_bit operation for setting
-		 * the flag as we are still initializing the pages.
-		 */
-		__SetPageReserved(page);
-
-		/*
-		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
-		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
-		 * page is ever freed or placed on a driver-private list.
-		 */
-		page->pgmap = pgmap;
-		page->hmm_data = 0;
-
-		/*
-		 * Mark the block movable so that blocks are reserved for
-		 * movable at startup. This will force kernel allocations
-		 * to reserve their blocks rather than leaking throughout
-		 * the address space during boot when many long-lived
-		 * kernel allocations are made.
-		 *
-		 * bitmap is created for zone's valid pfn range. but memmap
-		 * can be created for invalid pages (for alignment)
-		 * check here not to call set_pageblock_migratetype() against
-		 * pfn out of zone.
-		 *
-		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
-		 * because this is done early in sparse_add_one_section
-		 */
-		if (!(pfn & (pageblock_nr_pages - 1))) {
-			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-			cond_resched();
-		}
-	}
+	__memmap_init_hotplug(size, nid, zone_idx, start_pfn, pgmap);
 
 	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
 		size, jiffies_to_msecs(jiffies - start));