diff mbox series

[v4,2/2] mm: Init page count in reserve_bootmem_region when MEMINIT_EARLY

Message ID 20230928083302.386202-3-yajun.deng@linux.dev (mailing list archive)
State New
Headers show
Series mm: Don't set and reset page count in MEMINIT_EARLY | expand

Commit Message

Yajun Deng Sept. 28, 2023, 8:33 a.m. UTC
memmap_init_range() would init page count of all pages, but the free
pages count would be reset in __free_pages_core(). There are opposite
operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
context.

Init the page count in reserve_bootmem_region() when in MEMINIT_EARLY context,
and check the page count before resetting it.

At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
needed, as it is already done in __init_single_page.

The following data was tested on an x86 machine with 190GB of RAM.

before:
free_low_memory_core_early()    341ms

after:
free_low_memory_core_early()    285ms

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
---
v4: same with v2.
v3: same with v2.
v2: check page count instead of check context before reset it.
v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
---
 mm/mm_init.c    | 18 +++++++++++++-----
 mm/page_alloc.c | 20 ++++++++++++--------
 2 files changed, 25 insertions(+), 13 deletions(-)

Comments

Mike Rapoport Sept. 29, 2023, 8:30 a.m. UTC | #1
On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
> memmap_init_range() would init page count of all pages, but the free
> pages count would be reset in __free_pages_core(). There are opposite
> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
> context.
> 
> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
> and check the page count before reset it.
> 
> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
> need, as it already done in __init_single_page.
> 
> The following data was tested on an x86 machine with 190GB of RAM.
> 
> before:
> free_low_memory_core_early()    341ms
> 
> after:
> free_low_memory_core_early()    285ms
> 
> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> ---
> v4: same with v2.
> v3: same with v2.
> v2: check page count instead of check context before reset it.
> v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
> ---
>  mm/mm_init.c    | 18 +++++++++++++-----
>  mm/page_alloc.c | 20 ++++++++++++--------
>  2 files changed, 25 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 9716c8a7ade9..3ab8861e1ef3 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>  		if (zone_spans_pfn(zone, pfn))
>  			break;
>  	}
> -	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
> +	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>  }
>  #else
>  static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>  
>  			init_reserved_page(start_pfn, nid);
>  
> -			/* Avoid false-positive PageTail() */
> -			INIT_LIST_HEAD(&page->lru);
> +			/* Init page count for reserved region */

Please add a comment that describes _why_ we initialize the page count here.

> +			init_page_count(page);
>  
>  			/*
>  			 * no need for atomic set_bit because the struct
> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>  		}
>  
>  		page = pfn_to_page(pfn);
> -		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
> -		if (context == MEMINIT_HOTPLUG)
> +
> +		/* If the context is MEMINIT_EARLY, we will init page count and
> +		 * mark page reserved in reserve_bootmem_region, the free region
> +		 * wouldn't have page count and we will check the pages count
> +		 * in __free_pages_core.
> +		 */
> +		__init_single_page(page, pfn, zone, nid, 0);
> +		if (context == MEMINIT_HOTPLUG) {
> +			init_page_count(page);
>  			__SetPageReserved(page);

Rather than calling init_page_count() and __SetPageReserved() for
MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
and call __init_single_page() after the check for MEMINIT_HOTPLUG.

But more generally, I wonder if we have to differentiate HOTPLUG here at all.
@David, can you comment please?

> +		}
>  
>  		/*
>  		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 06be8821d833..b868caabe8dc 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>  	unsigned int loop;
>  
>  	/*
> -	 * When initializing the memmap, __init_single_page() sets the refcount
> -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
> -	 * refcount of all involved pages to 0.
> +	 * When initializing the memmap, memmap_init_range sets the refcount
> +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
> +	 * have to set the refcount of all involved pages to 0. Otherwise,
> +	 * we don't do it, as reserve_bootmem_region only set the refcount on
> +	 * reserve region ("reserved") in early context.
>  	 */

Again, why hotplug and early init should be different?

> -	prefetchw(p);
> -	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> -		prefetchw(p + 1);
> +	if (page_count(page)) {
> +		prefetchw(p);
> +		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> +			prefetchw(p + 1);
> +			__ClearPageReserved(p);
> +			set_page_count(p, 0);
> +		}
>  		__ClearPageReserved(p);
>  		set_page_count(p, 0);
>  	}
> -	__ClearPageReserved(p);
> -	set_page_count(p, 0);
>  
>  	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>  
> -- 
> 2.25.1
>
Yajun Deng Sept. 29, 2023, 9:50 a.m. UTC | #2
On 2023/9/29 16:30, Mike Rapoport wrote:
> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>> memmap_init_range() would init page count of all pages, but the free
>> pages count would be reset in __free_pages_core(). There are opposite
>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>> context.
>>
>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>> and check the page count before reset it.
>>
>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>> need, as it already done in __init_single_page.
>>
>> The following data was tested on an x86 machine with 190GB of RAM.
>>
>> before:
>> free_low_memory_core_early()    341ms
>>
>> after:
>> free_low_memory_core_early()    285ms
>>
>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>> ---
>> v4: same with v2.
>> v3: same with v2.
>> v2: check page count instead of check context before reset it.
>> v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>> ---
>>   mm/mm_init.c    | 18 +++++++++++++-----
>>   mm/page_alloc.c | 20 ++++++++++++--------
>>   2 files changed, 25 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index 9716c8a7ade9..3ab8861e1ef3 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>>   		if (zone_spans_pfn(zone, pfn))
>>   			break;
>>   	}
>> -	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>> +	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>   }
>>   #else
>>   static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>   
>>   			init_reserved_page(start_pfn, nid);
>>   
>> -			/* Avoid false-positive PageTail() */
>> -			INIT_LIST_HEAD(&page->lru);
>> +			/* Init page count for reserved region */
> Please add a comment that describes _why_ we initialize the page count here.
Okay.
>
>> +			init_page_count(page);
>>   
>>   			/*
>>   			 * no need for atomic set_bit because the struct
>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>>   		}
>>   
>>   		page = pfn_to_page(pfn);
>> -		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>> -		if (context == MEMINIT_HOTPLUG)
>> +
>> +		/* If the context is MEMINIT_EARLY, we will init page count and
>> +		 * mark page reserved in reserve_bootmem_region, the free region
>> +		 * wouldn't have page count and we will check the pages count
>> +		 * in __free_pages_core.
>> +		 */
>> +		__init_single_page(page, pfn, zone, nid, 0);
>> +		if (context == MEMINIT_HOTPLUG) {
>> +			init_page_count(page);
>>   			__SetPageReserved(page);
> Rather than calling init_page_count() and __SetPageReserved() for
> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> an call __init_single_page() after the check for MEMINIT_HOTPLUG.

No, the following code would cost more time than the current code in 
memmap_init().

if (context == MEMINIT_HOTPLUG)

	__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
else
	
	__init_single_page(page, pfn, zone, nid, 0);

> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> @David, can you comment please?
>
>> +		}
>>   
>>   		/*
>>   		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 06be8821d833..b868caabe8dc 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>   	unsigned int loop;
>>   
>>   	/*
>> -	 * When initializing the memmap, __init_single_page() sets the refcount
>> -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
>> -	 * refcount of all involved pages to 0.
>> +	 * When initializing the memmap, memmap_init_range sets the refcount
>> +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
>> +	 * have to set the refcount of all involved pages to 0. Otherwise,
>> +	 * we don't do it, as reserve_bootmem_region only set the refcount on
>> +	 * reserve region ("reserved") in early context.
>>   	 */
> Again, why hotplug and early init should be different?
I will add a comment describing that it saves boot time.
>
>> -	prefetchw(p);
>> -	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> -		prefetchw(p + 1);
>> +	if (page_count(page)) {
>> +		prefetchw(p);
>> +		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> +			prefetchw(p + 1);
>> +			__ClearPageReserved(p);
>> +			set_page_count(p, 0);
>> +		}
>>   		__ClearPageReserved(p);
>>   		set_page_count(p, 0);
>>   	}
>> -	__ClearPageReserved(p);
>> -	set_page_count(p, 0);
>>   
>>   	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>>   
>> -- 
>> 2.25.1
>>
Mike Rapoport Sept. 29, 2023, 10:02 a.m. UTC | #3
On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
> 
> On 2023/9/29 16:30, Mike Rapoport wrote:
> > On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
> > > memmap_init_range() would init page count of all pages, but the free
> > > pages count would be reset in __free_pages_core(). There are opposite
> > > operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
> > > context.
> > > 
> > > Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
> > > and check the page count before reset it.
> > > 
> > > At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
> > > need, as it already done in __init_single_page.
> > > 
> > > The following data was tested on an x86 machine with 190GB of RAM.
> > > 
> > > before:
> > > free_low_memory_core_early()    341ms
> > > 
> > > after:
> > > free_low_memory_core_early()    285ms
> > > 
> > > Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> > > ---
> > > v4: same with v2.
> > > v3: same with v2.
> > > v2: check page count instead of check context before reset it.
> > > v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
> > > ---
> > >   mm/mm_init.c    | 18 +++++++++++++-----
> > >   mm/page_alloc.c | 20 ++++++++++++--------
> > >   2 files changed, 25 insertions(+), 13 deletions(-)
> > > 
> > > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > > index 9716c8a7ade9..3ab8861e1ef3 100644
> > > --- a/mm/mm_init.c
> > > +++ b/mm/mm_init.c
> > > @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
> > >   		if (zone_spans_pfn(zone, pfn))
> > >   			break;
> > >   	}
> > > -	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
> > > +	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
> > >   }
> > >   #else
> > >   static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
> > > @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
> > >   			init_reserved_page(start_pfn, nid);
> > > -			/* Avoid false-positive PageTail() */
> > > -			INIT_LIST_HEAD(&page->lru);
> > > +			/* Init page count for reserved region */
> > Please add a comment that describes _why_ we initialize the page count here.
> Okay.
> > 
> > > +			init_page_count(page);
> > >   			/*
> > >   			 * no need for atomic set_bit because the struct
> > > @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
> > >   		}
> > >   		page = pfn_to_page(pfn);
> > > -		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
> > > -		if (context == MEMINIT_HOTPLUG)
> > > +
> > > +		/* If the context is MEMINIT_EARLY, we will init page count and
> > > +		 * mark page reserved in reserve_bootmem_region, the free region
> > > +		 * wouldn't have page count and we will check the pages count
> > > +		 * in __free_pages_core.
> > > +		 */
> > > +		__init_single_page(page, pfn, zone, nid, 0);
> > > +		if (context == MEMINIT_HOTPLUG) {
> > > +			init_page_count(page);
> > >   			__SetPageReserved(page);
> > Rather than calling init_page_count() and __SetPageReserved() for
> > MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> > an call __init_single_page() after the check for MEMINIT_HOTPLUG.
> 
> No, the following code would cost more time than the current code in
> memmap_init().
> 
> if (context == MEMINIT_HOTPLUG)
> 
> 	__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
> else
> 	
> 	__init_single_page(page, pfn, zone, nid, 0);

Sorry if I wasn't clear. What I meant was to have something along these lines:

	enum page_init_flags flags = 0;

	if (context == MEMINIT_HOTPLUG)
		flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
	__init_single_page(page, pfn, zone, nid, flags);

> > But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> > @David, can you comment please?
> > 
> > > +		}
> > >   		/*
> > >   		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index 06be8821d833..b868caabe8dc 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > >   	unsigned int loop;
> > >   	/*
> > > -	 * When initializing the memmap, __init_single_page() sets the refcount
> > > -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > -	 * refcount of all involved pages to 0.
> > > +	 * When initializing the memmap, memmap_init_range sets the refcount
> > > +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > +	 * have to set the refcount of all involved pages to 0. Otherwise,
> > > +	 * we don't do it, as reserve_bootmem_region only set the refcount on
> > > +	 * reserve region ("reserved") in early context.
> > >   	 */
> > Again, why hotplug and early init should be different?
> I will add a comment that describes it will save boot time.

But why do we need initialize struct pages differently at boot time vs
memory hotplug?
Is there a reason memory hotplug cannot have page count set to 0 just like
for pages reserved at boot time?
 
> > > -	prefetchw(p);
> > > -	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > -		prefetchw(p + 1);
> > > +	if (page_count(page)) {
> > > +		prefetchw(p);
> > > +		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > +			prefetchw(p + 1);
> > > +			__ClearPageReserved(p);
> > > +			set_page_count(p, 0);
> > > +		}
> > >   		__ClearPageReserved(p);
> > >   		set_page_count(p, 0);
> > >   	}
> > > -	__ClearPageReserved(p);
> > > -	set_page_count(p, 0);
> > >   	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
> > > -- 
> > > 2.25.1
> > >
Yajun Deng Sept. 29, 2023, 10:27 a.m. UTC | #4
On 2023/9/29 18:02, Mike Rapoport wrote:
> On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
>> On 2023/9/29 16:30, Mike Rapoport wrote:
>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>> memmap_init_range() would init page count of all pages, but the free
>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>>>> context.
>>>>
>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>>>> and check the page count before reset it.
>>>>
>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>> need, as it already done in __init_single_page.
>>>>
>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>
>>>> before:
>>>> free_low_memory_core_early()    341ms
>>>>
>>>> after:
>>>> free_low_memory_core_early()    285ms
>>>>
>>>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>>>> ---
>>>> v4: same with v2.
>>>> v3: same with v2.
>>>> v2: check page count instead of check context before reset it.
>>>> v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>>> ---
>>>>    mm/mm_init.c    | 18 +++++++++++++-----
>>>>    mm/page_alloc.c | 20 ++++++++++++--------
>>>>    2 files changed, 25 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>> --- a/mm/mm_init.c
>>>> +++ b/mm/mm_init.c
>>>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>>>>    		if (zone_spans_pfn(zone, pfn))
>>>>    			break;
>>>>    	}
>>>> -	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>>>> +	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>    }
>>>>    #else
>>>>    static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>>>    			init_reserved_page(start_pfn, nid);
>>>> -			/* Avoid false-positive PageTail() */
>>>> -			INIT_LIST_HEAD(&page->lru);
>>>> +			/* Init page count for reserved region */
>>> Please add a comment that describes _why_ we initialize the page count here.
>> Okay.
>>>> +			init_page_count(page);
>>>>    			/*
>>>>    			 * no need for atomic set_bit because the struct
>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>>>>    		}
>>>>    		page = pfn_to_page(pfn);
>>>> -		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>> -		if (context == MEMINIT_HOTPLUG)
>>>> +
>>>> +		/* If the context is MEMINIT_EARLY, we will init page count and
>>>> +		 * mark page reserved in reserve_bootmem_region, the free region
>>>> +		 * wouldn't have page count and we will check the pages count
>>>> +		 * in __free_pages_core.
>>>> +		 */
>>>> +		__init_single_page(page, pfn, zone, nid, 0);
>>>> +		if (context == MEMINIT_HOTPLUG) {
>>>> +			init_page_count(page);
>>>>    			__SetPageReserved(page);
>>> Rather than calling init_page_count() and __SetPageReserved() for
>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>> No, the following code would cost more time than the current code in
>> memmap_init().
>>
>> if (context == MEMINIT_HOTPLUG)
>>
>> 	__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
>> else
>> 	
>> 	__init_single_page(page, pfn, zone, nid, 0);
> Sorry if I wasn't clear. What I meant was to have something along these lines:
>
> 	enum page_init_flags flags = 0;
>
> 	if (context == MEMINIT_HOTPLUG)
> 		flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
> 	__init_single_page(page, pfn, zone, nid, flags);
>
Okay, I'll test the time consumed in memmap_init().
>>> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
>>> @David, can you comment please?
>>>
>>>> +		}
>>>>    		/*
>>>>    		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 06be8821d833..b868caabe8dc 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>>    	unsigned int loop;
>>>>    	/*
>>>> -	 * When initializing the memmap, __init_single_page() sets the refcount
>>>> -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>> -	 * refcount of all involved pages to 0.
>>>> +	 * When initializing the memmap, memmap_init_range sets the refcount
>>>> +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>> +	 * have to set the refcount of all involved pages to 0. Otherwise,
>>>> +	 * we don't do it, as reserve_bootmem_region only set the refcount on
>>>> +	 * reserve region ("reserved") in early context.
>>>>    	 */
>>> Again, why hotplug and early init should be different?
>> I will add a comment that describes it will save boot time.
> But why do we need initialize struct pages differently at boot time vs
> memory hotplug?
> Is there a reason memory hotplug cannot have page count set to 0 just like
> for pages reserved at boot time?
>   

This patch just saves boot time in MEMINIT_EARLY. If someone finds out
that it can also save time in

MEMINIT_HOTPLUG, I think that can be done in another patch later. I'm just
keeping it the same for now.

>>>> -	prefetchw(p);
>>>> -	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> -		prefetchw(p + 1);
>>>> +	if (page_count(page)) {
>>>> +		prefetchw(p);
>>>> +		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> +			prefetchw(p + 1);
>>>> +			__ClearPageReserved(p);
>>>> +			set_page_count(p, 0);
>>>> +		}
>>>>    		__ClearPageReserved(p);
>>>>    		set_page_count(p, 0);
>>>>    	}
>>>> -	__ClearPageReserved(p);
>>>> -	set_page_count(p, 0);
>>>>    	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>>>> -- 
>>>> 2.25.1
>>>>
Mike Rapoport Oct. 1, 2023, 6:59 p.m. UTC | #5
On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
> 
> On 2023/9/29 18:02, Mike Rapoport wrote:
> > On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
> > > On 2023/9/29 16:30, Mike Rapoport wrote:
> > > > On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
> > > > > memmap_init_range() would init page count of all pages, but the free
> > > > > pages count would be reset in __free_pages_core(). There are opposite
> > > > > operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
> > > > > context.
> > > > > 
> > > > > Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
> > > > > and check the page count before reset it.
> > > > > 
> > > > > At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
> > > > > need, as it already done in __init_single_page.
> > > > > 
> > > > > The following data was tested on an x86 machine with 190GB of RAM.
> > > > > 
> > > > > before:
> > > > > free_low_memory_core_early()    341ms
> > > > > 
> > > > > after:
> > > > > free_low_memory_core_early()    285ms
> > > > > 
> > > > > Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> > > > > ---
> > > > > v4: same with v2.
> > > > > v3: same with v2.
> > > > > v2: check page count instead of check context before reset it.
> > > > > v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
> > > > > ---
> > > > >    mm/mm_init.c    | 18 +++++++++++++-----
> > > > >    mm/page_alloc.c | 20 ++++++++++++--------
> > > > >    2 files changed, 25 insertions(+), 13 deletions(-)
> > > > > 
> > > > > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > > > > index 9716c8a7ade9..3ab8861e1ef3 100644
> > > > > --- a/mm/mm_init.c
> > > > > +++ b/mm/mm_init.c
> > > > > @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
> > > > >    		if (zone_spans_pfn(zone, pfn))
> > > > >    			break;
> > > > >    	}
> > > > > -	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
> > > > > +	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
> > > > >    }
> > > > >    #else
> > > > >    static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
> > > > > @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
> > > > >    			init_reserved_page(start_pfn, nid);
> > > > > -			/* Avoid false-positive PageTail() */
> > > > > -			INIT_LIST_HEAD(&page->lru);
> > > > > +			/* Init page count for reserved region */
> > > > Please add a comment that describes _why_ we initialize the page count here.
> > > Okay.
> > > > > +			init_page_count(page);
> > > > >    			/*
> > > > >    			 * no need for atomic set_bit because the struct
> > > > > @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
> > > > >    		}
> > > > >    		page = pfn_to_page(pfn);
> > > > > -		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
> > > > > -		if (context == MEMINIT_HOTPLUG)
> > > > > +
> > > > > +		/* If the context is MEMINIT_EARLY, we will init page count and
> > > > > +		 * mark page reserved in reserve_bootmem_region, the free region
> > > > > +		 * wouldn't have page count and we will check the pages count
> > > > > +		 * in __free_pages_core.
> > > > > +		 */
> > > > > +		__init_single_page(page, pfn, zone, nid, 0);
> > > > > +		if (context == MEMINIT_HOTPLUG) {
> > > > > +			init_page_count(page);
> > > > >    			__SetPageReserved(page);
> > > > Rather than calling init_page_count() and __SetPageReserved() for
> > > > MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> > > > an call __init_single_page() after the check for MEMINIT_HOTPLUG.
> > > No, the following code would cost more time than the current code in
> > > memmap_init().
> > > 
> > > if (context == MEMINIT_HOTPLUG)
> > > 
> > > 	__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
> > > else
> > > 	
> > > 	__init_single_page(page, pfn, zone, nid, 0);
> > Sorry if I wasn't clear. What I meant was to have something along these lines:
> > 
> > 	enum page_init_flags flags = 0;
> > 
> > 	if (context == MEMINIT_HOTPLUG)
> > 		flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
> > 	__init_single_page(page, pfn, zone, nid, flags);
> > 
> Okay, I'll test the time consumed in memmap_init().
> > > > But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> > > > @David, can you comment please?
> > > > 
> > > > > +		}
> > > > >    		/*
> > > > >    		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
> > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > index 06be8821d833..b868caabe8dc 100644
> > > > > --- a/mm/page_alloc.c
> > > > > +++ b/mm/page_alloc.c
> > > > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > > > >    	unsigned int loop;
> > > > >    	/*
> > > > > -	 * When initializing the memmap, __init_single_page() sets the refcount
> > > > > -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > > > -	 * refcount of all involved pages to 0.
> > > > > +	 * When initializing the memmap, memmap_init_range sets the refcount
> > > > > +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > > > +	 * have to set the refcount of all involved pages to 0. Otherwise,
> > > > > +	 * we don't do it, as reserve_bootmem_region only set the refcount on
> > > > > +	 * reserve region ("reserved") in early context.
> > > > >    	 */
> > > > Again, why hotplug and early init should be different?
> > > I will add a comment that describes it will save boot time.
> > But why do we need initialize struct pages differently at boot time vs
> > memory hotplug?
> > Is there a reason memory hotplug cannot have page count set to 0 just like
> > for pages reserved at boot time?
> 
> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
> it can save time in
> 
> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
> keeping it in the same.

But it's not the same. It becomes slower after your patch and the code that
frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
for no apparent reason.

The if (page_count(page)) is really non-obvious...
 
> > > > > -	prefetchw(p);
> > > > > -	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > > > -		prefetchw(p + 1);
> > > > > +	if (page_count(page)) {
> > > > > +		prefetchw(p);
> > > > > +		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > > > +			prefetchw(p + 1);
> > > > > +			__ClearPageReserved(p);
> > > > > +			set_page_count(p, 0);
> > > > > +		}
> > > > >    		__ClearPageReserved(p);
> > > > >    		set_page_count(p, 0);
> > > > >    	}
> > > > > -	__ClearPageReserved(p);
> > > > > -	set_page_count(p, 0);
> > > > >    	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
> > > > > -- 
> > > > > 2.25.1
> > > > >
Yajun Deng Oct. 2, 2023, 7:03 a.m. UTC | #6
On 2023/10/2 02:59, Mike Rapoport wrote:
> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>> On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
>>>> On 2023/9/29 16:30, Mike Rapoport wrote:
>>>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>>>> memmap_init_range() would init page count of all pages, but the free
>>>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>>>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>>>>>> context.
>>>>>>
>>>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>>>>>> and check the page count before reset it.
>>>>>>
>>>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>>>> need, as it already done in __init_single_page.
>>>>>>
>>>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>>>
>>>>>> before:
>>>>>> free_low_memory_core_early()    341ms
>>>>>>
>>>>>> after:
>>>>>> free_low_memory_core_early()    285ms
>>>>>>
>>>>>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>>>>>> ---
>>>>>> v4: same with v2.
>>>>>> v3: same with v2.
>>>>>> v2: check page count instead of check context before reset it.
>>>>>> v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>>>>> ---
>>>>>>     mm/mm_init.c    | 18 +++++++++++++-----
>>>>>>     mm/page_alloc.c | 20 ++++++++++++--------
>>>>>>     2 files changed, 25 insertions(+), 13 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>>>> --- a/mm/mm_init.c
>>>>>> +++ b/mm/mm_init.c
>>>>>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>>>>>>     		if (zone_spans_pfn(zone, pfn))
>>>>>>     			break;
>>>>>>     	}
>>>>>> -	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>>>>>> +	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>>>     }
>>>>>>     #else
>>>>>>     static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>>>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>>>>>     			init_reserved_page(start_pfn, nid);
>>>>>> -			/* Avoid false-positive PageTail() */
>>>>>> -			INIT_LIST_HEAD(&page->lru);
>>>>>> +			/* Init page count for reserved region */
>>>>> Please add a comment that describes _why_ we initialize the page count here.
>>>> Okay.
>>>>>> +			init_page_count(page);
>>>>>>     			/*
>>>>>>     			 * no need for atomic set_bit because the struct
>>>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>>>>>>     		}
>>>>>>     		page = pfn_to_page(pfn);
>>>>>> -		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>>>> -		if (context == MEMINIT_HOTPLUG)
>>>>>> +
>>>>>> +		/* If the context is MEMINIT_EARLY, we will init page count and
>>>>>> +		 * mark page reserved in reserve_bootmem_region, the free region
>>>>>> +		 * wouldn't have page count and we will check the pages count
>>>>>> +		 * in __free_pages_core.
>>>>>> +		 */
>>>>>> +		__init_single_page(page, pfn, zone, nid, 0);
>>>>>> +		if (context == MEMINIT_HOTPLUG) {
>>>>>> +			init_page_count(page);
>>>>>>     			__SetPageReserved(page);
>>>>> Rather than calling init_page_count() and __SetPageReserved() for
>>>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
>>>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>> No, the following code would cost more time than the current code in
>>>> memmap_init().
>>>>
>>>> if (context == MEMINIT_HOTPLUG)
>>>>
>>>> 	__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
>>>> else
>>>> 	
>>>> 	__init_single_page(page, pfn, zone, nid, 0);
>>> Sorry if I wasn't clear. What I meant was to have something along these lines:
>>>
>>> 	enum page_init_flags flags = 0;
>>>
>>> 	if (context == MEMINIT_HOTPLUG)
>>> 		flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
>>> 	__init_single_page(page, pfn, zone, nid, flags);
>>>
>> Okay, I'll test the time consumed in memmap_init().
>>>>> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
>>>>> @David, can you comment please?
>>>>>
>>>>>> +		}
>>>>>>     		/*
>>>>>>     		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>> --- a/mm/page_alloc.c
>>>>>> +++ b/mm/page_alloc.c
>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>>>>     	unsigned int loop;
>>>>>>     	/*
>>>>>> -	 * When initializing the memmap, __init_single_page() sets the refcount
>>>>>> -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>>> -	 * refcount of all involved pages to 0.
>>>>>> +	 * When initializing the memmap, memmap_init_range sets the refcount
>>>>>> +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>>>> +	 * have to set the refcount of all involved pages to 0. Otherwise,
>>>>>> +	 * we don't do it, as reserve_bootmem_region only set the refcount on
>>>>>> +	 * reserve region ("reserved") in early context.
>>>>>>     	 */
>>>>> Again, why hotplug and early init should be different?
>>>> I will add a comment that describes it will save boot time.
>>> But why do we need initialize struct pages differently at boot time vs
>>> memory hotplug?
>>> Is there a reason memory hotplug cannot have page count set to 0 just like
>>> for pages reserved at boot time?
>> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
>> it can save time in
>>
>> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
>> keeping it in the same.
> But it's not the same. It becomes slower after your patch and the code that
> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
> for no apparent reason.

__free_pages_core will also be called by others, such as:
deferred_free_range, do_collection and memblock_free_late.

We couldn't remove 'if (page_count(page))' even if we set the page count to
0 when MEMINIT_HOTPLUG.

The following data is the time consumed in online_pages_range().

start_pfn   nr_pages    before   after
0x680000    0x80000     4.315ms  4.461ms
0x700000    0x80000     4.468ms  4.440ms
0x780000    0x80000     4.295ms  3.835ms
0x800000    0x80000     3.928ms  4.377ms
0x880000    0x80000     4.321ms  4.378ms
0x900000    0x80000     4.448ms  3.964ms
0x980000    0x80000     4.000ms  4.381ms
0xa00000    0x80000     4.459ms  4.255ms
sum                     34.234ms  34.091ms

As we can see, it doesn't become slower with 'if (page_count(page))'
when MEMINIT_HOTPLUG.

> The if (page_count(page)) is really non-obvious...
>   
>>>>>> -	prefetchw(p);
>>>>>> -	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> -		prefetchw(p + 1);
>>>>>> +	if (page_count(page)) {
>>>>>> +		prefetchw(p);
>>>>>> +		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> +			prefetchw(p + 1);
>>>>>> +			__ClearPageReserved(p);
>>>>>> +			set_page_count(p, 0);
>>>>>> +		}
>>>>>>     		__ClearPageReserved(p);
>>>>>>     		set_page_count(p, 0);
>>>>>>     	}
>>>>>> -	__ClearPageReserved(p);
>>>>>> -	set_page_count(p, 0);
>>>>>>     	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>>>>>> -- 
>>>>>> 2.25.1
>>>>>>
David Hildenbrand Oct. 2, 2023, 8:30 a.m. UTC | #7
On 29.09.23 10:30, Mike Rapoport wrote:
> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>> memmap_init_range() would init page count of all pages, but the free
>> pages count would be reset in __free_pages_core(). There are opposite
>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>> context.
>>
>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>> and check the page count before reset it.
>>
>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>> need, as it already done in __init_single_page.
>>
>> The following data was tested on an x86 machine with 190GB of RAM.
>>
>> before:
>> free_low_memory_core_early()    341ms
>>
>> after:
>> free_low_memory_core_early()    285ms
>>
>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>> ---
>> v4: same with v2.
>> v3: same with v2.
>> v2: check page count instead of check context before reset it.
>> v1: https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>> ---
>>   mm/mm_init.c    | 18 +++++++++++++-----
>>   mm/page_alloc.c | 20 ++++++++++++--------
>>   2 files changed, 25 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index 9716c8a7ade9..3ab8861e1ef3 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>>   		if (zone_spans_pfn(zone, pfn))
>>   			break;
>>   	}
>> -	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>> +	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>   }
>>   #else
>>   static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>   
>>   			init_reserved_page(start_pfn, nid);
>>   
>> -			/* Avoid false-positive PageTail() */
>> -			INIT_LIST_HEAD(&page->lru);
>> +			/* Init page count for reserved region */
> 
> Please add a comment that describes _why_ we initialize the page count here.
> 
>> +			init_page_count(page);
>>   
>>   			/*
>>   			 * no need for atomic set_bit because the struct
>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>>   		}
>>   
>>   		page = pfn_to_page(pfn);
>> -		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>> -		if (context == MEMINIT_HOTPLUG)
>> +
>> +		/* If the context is MEMINIT_EARLY, we will init page count and
>> +		 * mark page reserved in reserve_bootmem_region, the free region
>> +		 * wouldn't have page count and we will check the pages count
>> +		 * in __free_pages_core.
>> +		 */
>> +		__init_single_page(page, pfn, zone, nid, 0);
>> +		if (context == MEMINIT_HOTPLUG) {
>> +			init_page_count(page);
>>   			__SetPageReserved(page);
> 
> Rather than calling init_page_count() and __SetPageReserved() for
> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
> 
> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> @David, can you comment please?

There are a lot of details to that, and I'll share some I can briefly think of.

1) __SetPageReserved()

I tried removing that a while ago, but there was a blocker (IIRC something about
ZONE_DEVICE). I still have the patches at [1] and I could probably take a look
if that blocker still exists (I recall that something changed at some point, but
I never had the time to follow up).

But once we stop setting the pages reserved, we might run into issues with ...


2) init_page_count()

virtio-mem, XEN balloon and HV-balloon add memory blocks that can contain holes.
set_online_page_callback() is used to intercept memory onlining and to expose
only the pages that are not holes to the buddy: calling generic_online_page() on !hole.

Holes are PageReserved but with an initialized page count. Memory offlining will fail on
PageReserved pages -- has_unmovable_pages().


At least virtio-mem clears the PageReserved flag of holes when onlining memory,
and currently relies on the page count being reasonable (so memory offlining can work).

static void virtio_mem_set_fake_offline(unsigned long pfn,
					unsigned long nr_pages, bool onlined)
{
	page_offline_begin();
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__SetPageOffline(page);
		if (!onlined) {
			SetPageDirty(page);
			/* FIXME: remove after cleanups */
			ClearPageReserved(page);
		}
	}
	page_offline_end();
}


For virtio-mem, we could initialize the page count there instead. The other PV drivers
might require a bit more thought.


[1] https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup

> 
>> +		}
>>   
>>   		/*
>>   		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 06be8821d833..b868caabe8dc 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>   	unsigned int loop;
>>   
>>   	/*
>> -	 * When initializing the memmap, __init_single_page() sets the refcount
>> -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
>> -	 * refcount of all involved pages to 0.
>> +	 * When initializing the memmap, memmap_init_range sets the refcount
>> +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
>> +	 * have to set the refcount of all involved pages to 0. Otherwise,
>> +	 * we don't do it, as reserve_bootmem_region only set the refcount on
>> +	 * reserve region ("reserved") in early context.
>>   	 */
> 
> Again, why hotplug and early init should be different?
> 
>> -	prefetchw(p);
>> -	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> -		prefetchw(p + 1);
>> +	if (page_count(page)) {
>> +		prefetchw(p);
>> +		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> +			prefetchw(p + 1);
>> +			__ClearPageReserved(p);
>> +			set_page_count(p, 0);
>> +		}
>>   		__ClearPageReserved(p);
>>   		set_page_count(p, 0);

That looks wrong. if the page count would by pure luck be 0 already for hotplugged memory,
you wouldn't clear the reserved flag.

These changes make me a bit nervous.
Mike Rapoport Oct. 2, 2023, 8:47 a.m. UTC | #8
On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
> 
> On 2023/10/2 02:59, Mike Rapoport wrote:
> > On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
> > > On 2023/9/29 18:02, Mike Rapoport wrote:
> > > > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > > > index 06be8821d833..b868caabe8dc 100644
> > > > > > > --- a/mm/page_alloc.c
> > > > > > > +++ b/mm/page_alloc.c
> > > > > > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > > > > > >     	unsigned int loop;
> > > > > > >     	/*
> > > > > > > -	 * When initializing the memmap, __init_single_page() sets the refcount
> > > > > > > -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > > > > > -	 * refcount of all involved pages to 0.
> > > > > > > +	 * When initializing the memmap, memmap_init_range sets the refcount
> > > > > > > +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > > > > > +	 * have to set the refcount of all involved pages to 0. Otherwise,
> > > > > > > +	 * we don't do it, as reserve_bootmem_region only set the refcount on
> > > > > > > +	 * reserve region ("reserved") in early context.
> > > > > > >     	 */
> > > > > > Again, why hotplug and early init should be different?
> > > > > I will add a comment that describes it will save boot time.
> > > > But why do we need initialize struct pages differently at boot time vs
> > > > memory hotplug?
> > > > Is there a reason memory hotplug cannot have page count set to 0 just like
> > > > for pages reserved at boot time?
> > > This patch just save boot time in MEMINIT_EARLY. If someone finds out that
> > > it can save time in
> > > 
> > > MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
> > > keeping it in the same.
> > But it's not the same. It becomes slower after your patch and the code that
> > frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
> > for no apparent reason.
> 
> __free_pages_core will also be called by others, such as:
> deferred_free_range, do_collection and memblock_free_late.
> 
> We couldn't remove  'if (page_count(page))' even if we set page count to 0
> when MEMINIT_HOTPLUG.

That 'if' breaks the invariant that __free_pages_core is always called for
pages with an initialized page count. Adding it may lead to subtle bugs and
random memory corruption, so we don't want to add it in the first place.
David Hildenbrand Oct. 2, 2023, 8:56 a.m. UTC | #9
On 02.10.23 10:47, Mike Rapoport wrote:
> On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
>>
>> On 2023/10/2 02:59, Mike Rapoport wrote:
>>> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>>>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>>>> --- a/mm/page_alloc.c
>>>>>>>> +++ b/mm/page_alloc.c
>>>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>>>>>>      	unsigned int loop;
>>>>>>>>      	/*
>>>>>>>> -	 * When initializing the memmap, __init_single_page() sets the refcount
>>>>>>>> -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>>>>> -	 * refcount of all involved pages to 0.
>>>>>>>> +	 * When initializing the memmap, memmap_init_range sets the refcount
>>>>>>>> +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>>>>>> +	 * have to set the refcount of all involved pages to 0. Otherwise,
>>>>>>>> +	 * we don't do it, as reserve_bootmem_region only set the refcount on
>>>>>>>> +	 * reserve region ("reserved") in early context.
>>>>>>>>      	 */
>>>>>>> Again, why hotplug and early init should be different?
>>>>>> I will add a comment that describes it will save boot time.
>>>>> But why do we need initialize struct pages differently at boot time vs
>>>>> memory hotplug?
>>>>> Is there a reason memory hotplug cannot have page count set to 0 just like
>>>>> for pages reserved at boot time?
>>>> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
>>>> it can save time in
>>>>
>>>> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
>>>> keeping it in the same.
>>> But it's not the same. It becomes slower after your patch and the code that
>>> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
>>> for no apparent reason.
>>
>> __free_pages_core will also be called by others, such as:
>> deferred_free_range, do_collection and memblock_free_late.
>>
>> We couldn't remove  'if (page_count(page))' even if we set page count to 0
>> when MEMINIT_HOTPLUG.
> 
> That 'if' breaks the invariant that __free_pages_core is always called for
> pages with initialized page count. Adding it may lead to subtle bugs and
> random memory corruption so we don't want to add it at the first place.

As long as we have to special-case memory hotplug, we know that we are 
always coming via generic_online_page() in that case. We could either 
move some logic over there, or let __free_pages_core() know what it 
should do.
Mike Rapoport Oct. 2, 2023, 11:10 a.m. UTC | #10
On Mon, Oct 02, 2023 at 10:56:51AM +0200, David Hildenbrand wrote:
> On 02.10.23 10:47, Mike Rapoport wrote:
> > On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
> > > 
> > > On 2023/10/2 02:59, Mike Rapoport wrote:
> > > > On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
> > > > > On 2023/9/29 18:02, Mike Rapoport wrote:
> > > > > > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > > > > > index 06be8821d833..b868caabe8dc 100644
> > > > > > > > > --- a/mm/page_alloc.c
> > > > > > > > > +++ b/mm/page_alloc.c
> > > > > > > > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > > > > > > > >      	unsigned int loop;
> > > > > > > > >      	/*
> > > > > > > > > -	 * When initializing the memmap, __init_single_page() sets the refcount
> > > > > > > > > -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > > > > > > > -	 * refcount of all involved pages to 0.
> > > > > > > > > +	 * When initializing the memmap, memmap_init_range sets the refcount
> > > > > > > > > +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > > > > > > > +	 * have to set the refcount of all involved pages to 0. Otherwise,
> > > > > > > > > +	 * we don't do it, as reserve_bootmem_region only set the refcount on
> > > > > > > > > +	 * reserve region ("reserved") in early context.
> > > > > > > > >      	 */
> > > > > > > > Again, why hotplug and early init should be different?
> > > > > > > I will add a comment that describes it will save boot time.
> > > > > > But why do we need initialize struct pages differently at boot time vs
> > > > > > memory hotplug?
> > > > > > Is there a reason memory hotplug cannot have page count set to 0 just like
> > > > > > for pages reserved at boot time?
> > > > > This patch just save boot time in MEMINIT_EARLY. If someone finds out that
> > > > > it can save time in
> > > > > 
> > > > > MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
> > > > > keeping it in the same.
> > > > But it's not the same. It becomes slower after your patch and the code that
> > > > frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
> > > > for no apparent reason.
> > > 
> > > __free_pages_core will also be called by others, such as:
> > > deferred_free_range, do_collection and memblock_free_late.
> > > 
> > > We couldn't remove  'if (page_count(page))' even if we set page count to 0
> > > when MEMINIT_HOTPLUG.
> > 
> > That 'if' breaks the invariant that __free_pages_core is always called for
> > pages with initialized page count. Adding it may lead to subtle bugs and
> > random memory corruption so we don't want to add it at the first place.
> 
> As long as we have to special-case memory hotplug, we know that we are
> always coming via generic_online_page() in that case. We could either move
> some logic over there, or let __free_pages_core() know what it should do.

Looks like the patch rather special-cases MEMINIT_EARLY, although I didn't
thoroughly check the other code paths. 
Anyway, relying on page_count() to be correct in different ways for
different callers of __free_pages_core() does not sound right to me.
 
> -- 
> Cheers,
> 
> David / dhildenb
>
David Hildenbrand Oct. 2, 2023, 11:25 a.m. UTC | #11
On 02.10.23 13:10, Mike Rapoport wrote:
> On Mon, Oct 02, 2023 at 10:56:51AM +0200, David Hildenbrand wrote:
>> On 02.10.23 10:47, Mike Rapoport wrote:
>>> On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
>>>>
>>>> On 2023/10/2 02:59, Mike Rapoport wrote:
>>>>> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>>>>>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>>>>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>>>>>> --- a/mm/page_alloc.c
>>>>>>>>>> +++ b/mm/page_alloc.c
>>>>>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>>>>>>>>       	unsigned int loop;
>>>>>>>>>>       	/*
>>>>>>>>>> -	 * When initializing the memmap, __init_single_page() sets the refcount
>>>>>>>>>> -	 * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>>>>>>> -	 * refcount of all involved pages to 0.
>>>>>>>>>> +	 * When initializing the memmap, memmap_init_range sets the refcount
>>>>>>>>>> +	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>>>>>>>> +	 * have to set the refcount of all involved pages to 0. Otherwise,
>>>>>>>>>> +	 * we don't do it, as reserve_bootmem_region only set the refcount on
>>>>>>>>>> +	 * reserve region ("reserved") in early context.
>>>>>>>>>>       	 */
>>>>>>>>> Again, why hotplug and early init should be different?
>>>>>>>> I will add a comment that describes it will save boot time.
>>>>>>> But why do we need initialize struct pages differently at boot time vs
>>>>>>> memory hotplug?
>>>>>>> Is there a reason memory hotplug cannot have page count set to 0 just like
>>>>>>> for pages reserved at boot time?
>>>>>> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
>>>>>> it can save time in
>>>>>>
>>>>>> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
>>>>>> keeping it in the same.
>>>>> But it's not the same. It becomes slower after your patch and the code that
>>>>> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
>>>>> for no apparent reason.
>>>>
>>>> __free_pages_core will also be called by others, such as:
>>>> deferred_free_range, do_collection and memblock_free_late.
>>>>
>>>> We couldn't remove  'if (page_count(page))' even if we set page count to 0
>>>> when MEMINIT_HOTPLUG.
>>>
>>> That 'if' breaks the invariant that __free_pages_core is always called for
>>> pages with initialized page count. Adding it may lead to subtle bugs and
>>> random memory corruption so we don't want to add it at the first place.
>>
>> As long as we have to special-case memory hotplug, we know that we are
>> always coming via generic_online_page() in that case. We could either move
>> some logic over there, or let __free_pages_core() know what it should do.
> 
> Looks like the patch rather special cases MEMINIT_EARLY, although I didn't
> check throughfully other code paths.
> Anyway, relying on page_count() to be correct in different ways for
> different callers of __free_pages_core() does not sound right to me.

Absolutely agreed.
Yajun Deng Oct. 3, 2023, 2:38 p.m. UTC | #12
On 2023/10/2 19:25, David Hildenbrand wrote:
> On 02.10.23 13:10, Mike Rapoport wrote:
>> On Mon, Oct 02, 2023 at 10:56:51AM +0200, David Hildenbrand wrote:
>>> On 02.10.23 10:47, Mike Rapoport wrote:
>>>> On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
>>>>>
>>>>> On 2023/10/2 02:59, Mike Rapoport wrote:
>>>>>> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>>>>>>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>>>>>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>>>>>>> --- a/mm/page_alloc.c
>>>>>>>>>>> +++ b/mm/page_alloc.c
>>>>>>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page 
>>>>>>>>>>> *page, unsigned int order)
>>>>>>>>>>>           unsigned int loop;
>>>>>>>>>>>           /*
>>>>>>>>>>> -     * When initializing the memmap, __init_single_page() 
>>>>>>>>>>> sets the refcount
>>>>>>>>>>> -     * of all pages to 1 ("allocated"/"not free"). We have 
>>>>>>>>>>> to set the
>>>>>>>>>>> -     * refcount of all involved pages to 0.
>>>>>>>>>>> +     * When initializing the memmap, memmap_init_range sets 
>>>>>>>>>>> the refcount
>>>>>>>>>>> +     * of all pages to 1 ("reserved" and "free") in hotplug 
>>>>>>>>>>> context. We
>>>>>>>>>>> +     * have to set the refcount of all involved pages to 0. 
>>>>>>>>>>> Otherwise,
>>>>>>>>>>> +     * we don't do it, as reserve_bootmem_region only set 
>>>>>>>>>>> the refcount on
>>>>>>>>>>> +     * reserve region ("reserved") in early context.
>>>>>>>>>>>            */
>>>>>>>>>> Again, why hotplug and early init should be different?
>>>>>>>>> I will add a comment that describes it will save boot time.
>>>>>>>> But why do we need initialize struct pages differently at boot 
>>>>>>>> time vs
>>>>>>>> memory hotplug?
>>>>>>>> Is there a reason memory hotplug cannot have page count set to 
>>>>>>>> 0 just like
>>>>>>>> for pages reserved at boot time?
>>>>>>> This patch just save boot time in MEMINIT_EARLY. If someone 
>>>>>>> finds out that
>>>>>>> it can save time in
>>>>>>>
>>>>>>> MEMINIT_HOTPLUG, I think it can be done in another patch later. 
>>>>>>> I just
>>>>>>> keeping it in the same.
>>>>>> But it's not the same. It becomes slower after your patch and the 
>>>>>> code that
>>>>>> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes 
>>>>>> non-uniform
>>>>>> for no apparent reason.
>>>>>
>>>>> __free_pages_core will also be called by others, such as:
>>>>> deferred_free_range, do_collection and memblock_free_late.
>>>>>
>>>>> We couldn't remove  'if (page_count(page))' even if we set page 
>>>>> count to 0
>>>>> when MEMINIT_HOTPLUG.
>>>>
>>>> That 'if' breaks the invariant that __free_pages_core is always 
>>>> called for
>>>> pages with initialized page count. Adding it may lead to subtle 
>>>> bugs and
>>>> random memory corruption so we don't want to add it at the first 
>>>> place.
>>>
>>> As long as we have to special-case memory hotplug, we know that we are
>>> always coming via generic_online_page() in that case. We could 
>>> either move
>>> some logic over there, or let __free_pages_core() know what it 
>>> should do.
>>
>> Looks like the patch rather special cases MEMINIT_EARLY, although I 
>> didn't
>> check throughfully other code paths.
>> Anyway, relying on page_count() to be correct in different ways for
>> different callers of __free_pages_core() does not sound right to me.
>
> Absolutely agreed.
>
I already sent v5 a few days ago. Comments, please...
Mike Rapoport Oct. 5, 2023, 5:06 a.m. UTC | #13
On Tue, Oct 03, 2023 at 10:38:09PM +0800, Yajun Deng wrote:
> 
> On 2023/10/2 19:25, David Hildenbrand wrote:
> > On 02.10.23 13:10, Mike Rapoport wrote:
> > > > > 
> > > > > That 'if' breaks the invariant that __free_pages_core is
> > > > > always called for
> > > > > pages with initialized page count. Adding it may lead to
> > > > > subtle bugs and
> > > > > random memory corruption so we don't want to add it at the
> > > > > first place.
> > > > 
> > > > As long as we have to special-case memory hotplug, we know that we are
> > > > always coming via generic_online_page() in that case. We could
> > > > either move
> > > > some logic over there, or let __free_pages_core() know what it
> > > > should do.
> > > 
> > > Looks like the patch rather special cases MEMINIT_EARLY, although I
> > > didn't
> > > check throughfully other code paths.
> > > Anyway, relying on page_count() to be correct in different ways for
> > > different callers of __free_pages_core() does not sound right to me.
> > 
> > Absolutely agreed.
> > 
> I already sent v5  a few days ago. Comments, please...

Does it address all the feedback from this thread?
Yajun Deng Oct. 5, 2023, 2:04 p.m. UTC | #14
On 2023/10/5 13:06, Mike Rapoport wrote:
> On Tue, Oct 03, 2023 at 10:38:09PM +0800, Yajun Deng wrote:
>> On 2023/10/2 19:25, David Hildenbrand wrote:
>>> On 02.10.23 13:10, Mike Rapoport wrote:
>>>>>> That 'if' breaks the invariant that __free_pages_core is
>>>>>> always called for
>>>>>> pages with initialized page count. Adding it may lead to
>>>>>> subtle bugs and
>>>>>> random memory corruption so we don't want to add it at the
>>>>>> first place.
>>>>> As long as we have to special-case memory hotplug, we know that we are
>>>>> always coming via generic_online_page() in that case. We could
>>>>> either move
>>>>> some logic over there, or let __free_pages_core() know what it
>>>>> should do.
>>>> Looks like the patch rather special cases MEMINIT_EARLY, although I
>>>> didn't
>>>> check throughfully other code paths.
>>>> Anyway, relying on page_count() to be correct in different ways for
>>>> different callers of __free_pages_core() does not sound right to me.
>>> Absolutely agreed.
>>>
>> I already sent v5  a few days ago. Comments, please...
> Does it address all the feedback from this thread?
>

Except hotplug. As far as I know, we only clear the page count in
MEMINIT_EARLY and in all tail pages of a compound page.

So adding 'if (page_count(page))' will have no actual effect for the other
cases. According to the previous data, it didn't become slower in hotplug.
Yajun Deng Oct. 8, 2023, 8:57 a.m. UTC | #15
On 2023/10/2 16:30, David Hildenbrand wrote:
> On 29.09.23 10:30, Mike Rapoport wrote:
>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>> memmap_init_range() would init page count of all pages, but the free
>>> pages count would be reset in __free_pages_core(). There are opposite
>>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>>> context.
>>>
>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY 
>>> context,
>>> and check the page count before reset it.
>>>
>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>> need, as it already done in __init_single_page.
>>>
>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>
>>> before:
>>> free_low_memory_core_early()    341ms
>>>
>>> after:
>>> free_low_memory_core_early()    285ms
>>>
>>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>>> ---
>>> v4: same with v2.
>>> v3: same with v2.
>>> v2: check page count instead of check context before reset it.
>>> v1: 
>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>> ---
>>>   mm/mm_init.c    | 18 +++++++++++++-----
>>>   mm/page_alloc.c | 20 ++++++++++++--------
>>>   2 files changed, 25 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>> --- a/mm/mm_init.c
>>> +++ b/mm/mm_init.c
>>> @@ -718,7 +718,7 @@ static void __meminit 
>>> init_reserved_page(unsigned long pfn, int nid)
>>>           if (zone_spans_pfn(zone, pfn))
>>>               break;
>>>       }
>>> -    __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 
>>> INIT_PAGE_COUNT);
>>> +    __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>   }
>>>   #else
>>>   static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>> @@ -756,8 +756,8 @@ void __meminit 
>>> reserve_bootmem_region(phys_addr_t start,
>>>                 init_reserved_page(start_pfn, nid);
>>>   -            /* Avoid false-positive PageTail() */
>>> -            INIT_LIST_HEAD(&page->lru);
>>> +            /* Init page count for reserved region */
>>
>> Please add a comment that describes _why_ we initialize the page 
>> count here.
>>
>>> +            init_page_count(page);
>>>                 /*
>>>                * no need for atomic set_bit because the struct
>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long 
>>> size, int nid, unsigned long zone
>>>           }
>>>             page = pfn_to_page(pfn);
>>> -        __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>> -        if (context == MEMINIT_HOTPLUG)
>>> +
>>> +        /* If the context is MEMINIT_EARLY, we will init page count 
>>> and
>>> +         * mark page reserved in reserve_bootmem_region, the free 
>>> region
>>> +         * wouldn't have page count and we will check the pages count
>>> +         * in __free_pages_core.
>>> +         */
>>> +        __init_single_page(page, pfn, zone, nid, 0);
>>> +        if (context == MEMINIT_HOTPLUG) {
>>> +            init_page_count(page);
>>>               __SetPageReserved(page);
>>
>> Rather than calling init_page_count() and __SetPageReserved() for
>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | 
>> INIT_PAGE_RESERVED
>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>
>> But more generally, I wonder if we have to differentiate HOTPLUG here 
>> at all.
>> @David, can you comment please?
>
> There are a lot of details to that, and I'll share some I can briefly 
> think of.
>
> 1) __SetPageReserved()
>
> I tried removing that a while ago, but there was a blocker (IIRC 
> something about
> ZONE_DEVICE). I still have the patches at [1] and I could probably 
> take a look
> if that blocker still exists (I recall that something changed at some 
> point, but
> I never had the time to follow up).
>
> But once we stop setting the pages reserved, we might run into issues 
> with ...
>
>
> 2) init_page_count()
>
> virtio-mem, XEN balloon and HV-balloon add memory blocks that can 
> contain holes.
> set_online_page_callback() is used to intercept memory onlining and to 
> expose
> only the pages that are not holes to the buddy: calling 
> generic_online_page() on !hole.
>
> Holes are PageReserved but with an initialized page count. Memory 
> offlining will fail on
> PageReserved pages -- has_unmovable_pages().
>
>
> At least virtio-mem clears the PageReserved flag of holes when 
> onlining memory,
> and currently relies in the page count to be reasonable (so memory 
> offlining can work).
>
> static void virtio_mem_set_fake_offline(unsigned long pfn,
>                     unsigned long nr_pages, bool onlined)
> {
>     page_offline_begin();
>     for (; nr_pages--; pfn++) {
>         struct page *page = pfn_to_page(pfn);
>
>         __SetPageOffline(page);
>         if (!onlined) {
>             SetPageDirty(page);
>             /* FIXME: remove after cleanups */
>             ClearPageReserved(page);
>         }
>     }
>     page_offline_end();
> }
>
>
> For virtio-mem, we could initialize the page count there instead. The 
> other PV drivers
> might require a bit more thought.
>
>
> [1] 
> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>
>>
>>> +        }
>>>             /*
>>>            * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 06be8821d833..b868caabe8dc 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, 
>>> unsigned int order)
>>>       unsigned int loop;
>>>         /*
>>> -     * When initializing the memmap, __init_single_page() sets the 
>>> refcount
>>> -     * of all pages to 1 ("allocated"/"not free"). We have to set the
>>> -     * refcount of all involved pages to 0.
>>> +     * When initializing the memmap, memmap_init_range sets the 
>>> refcount
>>> +     * of all pages to 1 ("reserved" and "free") in hotplug 
>>> context. We
>>> +     * have to set the refcount of all involved pages to 0. Otherwise,
>>> +     * we don't do it, as reserve_bootmem_region only set the 
>>> refcount on
>>> +     * reserve region ("reserved") in early context.
>>>        */
>>
>> Again, why hotplug and early init should be different?
>>
>>> -    prefetchw(p);
>>> -    for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>> -        prefetchw(p + 1);
>>> +    if (page_count(page)) {
>>> +        prefetchw(p);
>>> +        for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>> +            prefetchw(p + 1);
>>> +            __ClearPageReserved(p);
>>> +            set_page_count(p, 0);
>>> +        }
>>>           __ClearPageReserved(p);
>>>           set_page_count(p, 0);
>
> That looks wrong. if the page count would by pure luck be 0 already 
> for hotplugged memory,
> you wouldn't clear the reserved flag.
>
> These changes make me a bit nervous.


Would 'if (page_count(page) || PageReserved(page))' be safer? Or do I need 
to do something else?
Yajun Deng Oct. 10, 2023, 2:31 a.m. UTC | #16
On 2023/10/8 16:57, Yajun Deng wrote:
>
> On 2023/10/2 16:30, David Hildenbrand wrote:
>> On 29.09.23 10:30, Mike Rapoport wrote:
>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>> memmap_init_range() would init page count of all pages, but the free
>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>> operations. It's unnecessary and time-consuming when it's 
>>>> MEMINIT_EARLY
>>>> context.
>>>>
>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY 
>>>> context,
>>>> and check the page count before reset it.
>>>>
>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>> need, as it already done in __init_single_page.
>>>>
>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>
>>>> before:
>>>> free_low_memory_core_early()    341ms
>>>>
>>>> after:
>>>> free_low_memory_core_early()    285ms
>>>>
>>>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>>>> ---
>>>> v4: same with v2.
>>>> v3: same with v2.
>>>> v2: check page count instead of check context before reset it.
>>>> v1: 
>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>>> ---
>>>>   mm/mm_init.c    | 18 +++++++++++++-----
>>>>   mm/page_alloc.c | 20 ++++++++++++--------
>>>>   2 files changed, 25 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>> --- a/mm/mm_init.c
>>>> +++ b/mm/mm_init.c
>>>> @@ -718,7 +718,7 @@ static void __meminit 
>>>> init_reserved_page(unsigned long pfn, int nid)
>>>>           if (zone_spans_pfn(zone, pfn))
>>>>               break;
>>>>       }
>>>> -    __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 
>>>> INIT_PAGE_COUNT);
>>>> +    __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>   }
>>>>   #else
>>>>   static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>> @@ -756,8 +756,8 @@ void __meminit 
>>>> reserve_bootmem_region(phys_addr_t start,
>>>>                 init_reserved_page(start_pfn, nid);
>>>>   -            /* Avoid false-positive PageTail() */
>>>> -            INIT_LIST_HEAD(&page->lru);
>>>> +            /* Init page count for reserved region */
>>>
>>> Please add a comment that describes _why_ we initialize the page 
>>> count here.
>>>
>>>> +            init_page_count(page);
>>>>                 /*
>>>>                * no need for atomic set_bit because the struct
>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long 
>>>> size, int nid, unsigned long zone
>>>>           }
>>>>             page = pfn_to_page(pfn);
>>>> -        __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>> -        if (context == MEMINIT_HOTPLUG)
>>>> +
>>>> +        /* If the context is MEMINIT_EARLY, we will init page 
>>>> count and
>>>> +         * mark page reserved in reserve_bootmem_region, the free 
>>>> region
>>>> +         * wouldn't have page count and we will check the pages count
>>>> +         * in __free_pages_core.
>>>> +         */
>>>> +        __init_single_page(page, pfn, zone, nid, 0);
>>>> +        if (context == MEMINIT_HOTPLUG) {
>>>> +            init_page_count(page);
>>>>               __SetPageReserved(page);
>>>
>>> Rather than calling init_page_count() and __SetPageReserved() for
>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | 
>>> INIT_PAGE_RESERVED
>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>
>>> But more generally, I wonder if we have to differentiate HOTPLUG 
>>> here at all.
>>> @David, can you comment please?
>>
>> There are a lot of details to that, and I'll share some I can briefly 
>> think of.
>>
>> 1) __SetPageReserved()
>>
>> I tried removing that a while ago, but there was a blocker (IIRC 
>> something about
>> ZONE_DEVICE). I still have the patches at [1] and I could probably 
>> take a look
>> if that blocker still exists (I recall that something changed at some 
>> point, but
>> I never had the time to follow up).
>>
>> But once we stop setting the pages reserved, we might run into issues 
>> with ...
>>
>>
>> 2) init_page_count()
>>
>> virtio-mem, XEN balloon and HV-balloon add memory blocks that can 
>> contain holes.
>> set_online_page_callback() is used to intercept memory onlining and 
>> to expose
>> only the pages that are not holes to the buddy: calling 
>> generic_online_page() on !hole.
>>
>> Holes are PageReserved but with an initialized page count. Memory 
>> offlining will fail on
>> PageReserved pages -- has_unmovable_pages().
>>
>>
>> At least virtio-mem clears the PageReserved flag of holes when 
>> onlining memory,
>> and currently relies in the page count to be reasonable (so memory 
>> offlining can work).
>>
>> static void virtio_mem_set_fake_offline(unsigned long pfn,
>>                     unsigned long nr_pages, bool onlined)
>> {
>>     page_offline_begin();
>>     for (; nr_pages--; pfn++) {
>>         struct page *page = pfn_to_page(pfn);
>>
>>         __SetPageOffline(page);
>>         if (!onlined) {
>>             SetPageDirty(page);
>>             /* FIXME: remove after cleanups */
>>             ClearPageReserved(page);
>>         }
>>     }
>>     page_offline_end();
>> }
>>
>>
>> For virtio-mem, we could initialize the page count there instead. The 
>> other PV drivers
>> might require a bit more thought.
>>
>>
>> [1] 
>> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>>
>>>
>>>> +        }
>>>>             /*
>>>>            * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 06be8821d833..b868caabe8dc 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, 
>>>> unsigned int order)
>>>>       unsigned int loop;
>>>>         /*
>>>> -     * When initializing the memmap, __init_single_page() sets the 
>>>> refcount
>>>> -     * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>> -     * refcount of all involved pages to 0.
>>>> +     * When initializing the memmap, memmap_init_range sets the 
>>>> refcount
>>>> +     * of all pages to 1 ("reserved" and "free") in hotplug 
>>>> context. We
>>>> +     * have to set the refcount of all involved pages to 0. 
>>>> Otherwise,
>>>> +     * we don't do it, as reserve_bootmem_region only set the 
>>>> refcount on
>>>> +     * reserve region ("reserved") in early context.
>>>>        */
>>>
>>> Again, why hotplug and early init should be different?
>>>
>>>> -    prefetchw(p);
>>>> -    for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> -        prefetchw(p + 1);
>>>> +    if (page_count(page)) {
>>>> +        prefetchw(p);
>>>> +        for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> +            prefetchw(p + 1);
>>>> +            __ClearPageReserved(p);
>>>> +            set_page_count(p, 0);
>>>> +        }
>>>>           __ClearPageReserved(p);
>>>>           set_page_count(p, 0);
>>
>> That looks wrong. if the page count would by pure luck be 0 already 
>> for hotplugged memory,
>> you wouldn't clear the reserved flag.
>>
>> These changes make me a bit nervous.
>
>
> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I 
> need to do something else?
>

How about the following if statement? But it would need an additional patch 
like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).

It'll be safer, but more complex. Please comment...

	if (context != MEMINIT_EARLY || (page_count(page) || PageReserved(page))) {
Mike Rapoport Oct. 12, 2023, 9:19 a.m. UTC | #17
On Thu, Oct 05, 2023 at 10:04:28PM +0800, Yajun Deng wrote:
> 
> > > > > > > That 'if' breaks the invariant that __free_pages_core is
> > > > > > > always called for pages with initialized page count. Adding
> > > > > > > it may lead to subtle bugs and random memory corruption so we
> > > > > > > don't want to add it at the first place.
> > > > > >
> > > > > > As long as we have to special-case memory hotplug, we know that
> > > > > > we are always coming via generic_online_page() in that case. We
> > > > > > could either move some logic over there, or let
> > > > > > __free_pages_core() know what it should do.
> > > > >
> > > > > Looks like the patch rather special cases MEMINIT_EARLY, although
> > > > > I didn't check throughfully other code paths.  Anyway, relying on
> > > > > page_count() to be correct in different ways for different
> > > > > callers of __free_pages_core() does not sound right to me.
> > > >
> > > > Absolutely agreed.
> > > > 
> > > I already sent v5  a few days ago. Comments, please...
> >
> > Does it address all the feedback from this thread?
> 
> Except hotplug. 

Please reread carefully the last comments from me and from David above.
David Hildenbrand Oct. 12, 2023, 9:23 a.m. UTC | #18
On 10.10.23 04:31, Yajun Deng wrote:
> 
> On 2023/10/8 16:57, Yajun Deng wrote:
>>
>> On 2023/10/2 16:30, David Hildenbrand wrote:
>>> On 29.09.23 10:30, Mike Rapoport wrote:
>>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>>> memmap_init_range() would init page count of all pages, but the free
>>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>>> operations. It's unnecessary and time-consuming when it's
>>>>> MEMINIT_EARLY
>>>>> context.
>>>>>
>>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY
>>>>> context,
>>>>> and check the page count before reset it.
>>>>>
>>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>>> need, as it already done in __init_single_page.
>>>>>
>>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>>
>>>>> before:
>>>>> free_low_memory_core_early()    341ms
>>>>>
>>>>> after:
>>>>> free_low_memory_core_early()    285ms
>>>>>
>>>>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>>>>> ---
>>>>> v4: same with v2.
>>>>> v3: same with v2.
>>>>> v2: check page count instead of check context before reset it.
>>>>> v1:
>>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>>>> ---
>>>>>    mm/mm_init.c    | 18 +++++++++++++-----
>>>>>    mm/page_alloc.c | 20 ++++++++++++--------
>>>>>    2 files changed, 25 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>>> --- a/mm/mm_init.c
>>>>> +++ b/mm/mm_init.c
>>>>> @@ -718,7 +718,7 @@ static void __meminit
>>>>> init_reserved_page(unsigned long pfn, int nid)
>>>>>            if (zone_spans_pfn(zone, pfn))
>>>>>                break;
>>>>>        }
>>>>> -    __init_single_page(pfn_to_page(pfn), pfn, zid, nid,
>>>>> INIT_PAGE_COUNT);
>>>>> +    __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>>    }
>>>>>    #else
>>>>>    static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>>> @@ -756,8 +756,8 @@ void __meminit
>>>>> reserve_bootmem_region(phys_addr_t start,
>>>>>                  init_reserved_page(start_pfn, nid);
>>>>>    -            /* Avoid false-positive PageTail() */
>>>>> -            INIT_LIST_HEAD(&page->lru);
>>>>> +            /* Init page count for reserved region */
>>>>
>>>> Please add a comment that describes _why_ we initialize the page
>>>> count here.
>>>>
>>>>> +            init_page_count(page);
>>>>>                  /*
>>>>>                 * no need for atomic set_bit because the struct
>>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long
>>>>> size, int nid, unsigned long zone
>>>>>            }
>>>>>              page = pfn_to_page(pfn);
>>>>> -        __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>>> -        if (context == MEMINIT_HOTPLUG)
>>>>> +
>>>>> +        /* If the context is MEMINIT_EARLY, we will init page
>>>>> count and
>>>>> +         * mark page reserved in reserve_bootmem_region, the free
>>>>> region
>>>>> +         * wouldn't have page count and we will check the pages count
>>>>> +         * in __free_pages_core.
>>>>> +         */
>>>>> +        __init_single_page(page, pfn, zone, nid, 0);
>>>>> +        if (context == MEMINIT_HOTPLUG) {
>>>>> +            init_page_count(page);
>>>>>                __SetPageReserved(page);
>>>>
>>>> Rather than calling init_page_count() and __SetPageReserved() for
>>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT |
>>>> INIT_PAGE_RESERVED
>>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>>
>>>> But more generally, I wonder if we have to differentiate HOTPLUG
>>>> here at all.
>>>> @David, can you comment please?
>>>
>>> There are a lot of details to that, and I'll share some I can briefly
>>> think of.
>>>
>>> 1) __SetPageReserved()
>>>
>>> I tried removing that a while ago, but there was a blocker (IIRC
>>> something about
>>> ZONE_DEVICE). I still have the patches at [1] and I could probably
>>> take a look
>>> if that blocker still exists (I recall that something changed at some
>>> point, but
>>> I never had the time to follow up).
>>>
>>> But once we stop setting the pages reserved, we might run into issues
>>> with ...
>>>
>>>
>>> 2) init_page_count()
>>>
>>> virtio-mem, XEN balloon and HV-balloon add memory blocks that can
>>> contain holes.
>>> set_online_page_callback() is used to intercept memory onlining and
>>> to expose
>>> only the pages that are not holes to the buddy: calling
>>> generic_online_page() on !hole.
>>>
>>> Holes are PageReserved but with an initialized page count. Memory
>>> offlining will fail on
>>> PageReserved pages -- has_unmovable_pages().
>>>
>>>
>>> At least virtio-mem clears the PageReserved flag of holes when
>>> onlining memory,
>>> and currently relies in the page count to be reasonable (so memory
>>> offlining can work).
>>>
>>> static void virtio_mem_set_fake_offline(unsigned long pfn,
>>>                      unsigned long nr_pages, bool onlined)
>>> {
>>>      page_offline_begin();
>>>      for (; nr_pages--; pfn++) {
>>>          struct page *page = pfn_to_page(pfn);
>>>
>>>          __SetPageOffline(page);
>>>          if (!onlined) {
>>>              SetPageDirty(page);
>>>              /* FIXME: remove after cleanups */
>>>              ClearPageReserved(page);
>>>          }
>>>      }
>>>      page_offline_end();
>>> }
>>>
>>>
>>> For virtio-mem, we could initialize the page count there instead. The
>>> other PV drivers
>>> might require a bit more thought.
>>>
>>>
>>> [1]
>>> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>>>
>>>>
>>>>> +        }
>>>>>              /*
>>>>>             * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>> --- a/mm/page_alloc.c
>>>>> +++ b/mm/page_alloc.c
>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page,
>>>>> unsigned int order)
>>>>>        unsigned int loop;
>>>>>          /*
>>>>> -     * When initializing the memmap, __init_single_page() sets the
>>>>> refcount
>>>>> -     * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>> -     * refcount of all involved pages to 0.
>>>>> +     * When initializing the memmap, memmap_init_range sets the
>>>>> refcount
>>>>> +     * of all pages to 1 ("reserved" and "free") in hotplug
>>>>> context. We
>>>>> +     * have to set the refcount of all involved pages to 0.
>>>>> Otherwise,
>>>>> +     * we don't do it, as reserve_bootmem_region only set the
>>>>> refcount on
>>>>> +     * reserve region ("reserved") in early context.
>>>>>         */
>>>>
>>>> Again, why hotplug and early init should be different?
>>>>
>>>>> -    prefetchw(p);
>>>>> -    for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>> -        prefetchw(p + 1);
>>>>> +    if (page_count(page)) {
>>>>> +        prefetchw(p);
>>>>> +        for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>> +            prefetchw(p + 1);
>>>>> +            __ClearPageReserved(p);
>>>>> +            set_page_count(p, 0);
>>>>> +        }
>>>>>            __ClearPageReserved(p);
>>>>>            set_page_count(p, 0);
>>>
>>> That looks wrong. if the page count would by pure luck be 0 already
>>> for hotplugged memory,
>>> you wouldn't clear the reserved flag.
>>>
>>> These changes make me a bit nervous.
>>
>>
>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>> need to do something else?
>>
> 
> How about the following if statement? But it needs to add more patch
> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
> 
> It'll be safer, but more complex. Please comment...
> 
> 	if (context != MEMINIT_EARLY || (page_count(page) || PageReserved(page)) {
> 

Ideally we could make initialization only depend on the context, and not 
check for count or the reserved flag.
Yajun Deng Oct. 12, 2023, 9:36 a.m. UTC | #19
On 2023/10/12 17:19, Mike Rapoport wrote:
> On Thu, Oct 05, 2023 at 10:04:28PM +0800, Yajun Deng wrote:
>>>>>>>> That 'if' breaks the invariant that __free_pages_core is
>>>>>>>> always called for pages with initialized page count. Adding
>>>>>>>> it may lead to subtle bugs and random memory corruption so we
>>>>>>>> don't want to add it at the first place.
>>>>>>> As long as we have to special-case memory hotplug, we know that
>>>>>>> we are always coming via generic_online_page() in that case. We
>>>>>>> could either move some logic over there, or let
>>>>>>> __free_pages_core() know what it should do.
>>>>>> Looks like the patch rather special cases MEMINIT_EARLY, although
>>>>>> I didn't check throughfully other code paths.  Anyway, relying on
>>>>>> page_count() to be correct in different ways for different
>>>>>> callers of __free_pages_core() does not sound right to me.
>>>>> Absolutely agreed.
>>>>>
>>>> I already sent v5  a few days ago. Comments, please...
>>> Does it address all the feedback from this thread?
>> Except hotplug.
> Please reread carefully the last comments from me and from David above.
>

I replied in another thread about that 'if' statement. David just 
replied to me; let's continue the discussion there.
Yajun Deng Oct. 12, 2023, 9:53 a.m. UTC | #20
On 2023/10/12 17:23, David Hildenbrand wrote:
> On 10.10.23 04:31, Yajun Deng wrote:
>>
>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>
>>> On 2023/10/2 16:30, David Hildenbrand wrote:
>>>> On 29.09.23 10:30, Mike Rapoport wrote:
>>>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>>>> memmap_init_range() would init page count of all pages, but the free
>>>>>> pages count would be reset in __free_pages_core(). There are 
>>>>>> opposite
>>>>>> operations. It's unnecessary and time-consuming when it's
>>>>>> MEMINIT_EARLY
>>>>>> context.
>>>>>>
>>>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY
>>>>>> context,
>>>>>> and check the page count before reset it.
>>>>>>
>>>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>>>> need, as it already done in __init_single_page.
>>>>>>
>>>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>>>
>>>>>> before:
>>>>>> free_low_memory_core_early()    341ms
>>>>>>
>>>>>> after:
>>>>>> free_low_memory_core_early()    285ms
>>>>>>
>>>>>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>>>>>> ---
>>>>>> v4: same with v2.
>>>>>> v3: same with v2.
>>>>>> v2: check page count instead of check context before reset it.
>>>>>> v1:
>>>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/ 
>>>>>>
>>>>>> ---
>>>>>>    mm/mm_init.c    | 18 +++++++++++++-----
>>>>>>    mm/page_alloc.c | 20 ++++++++++++--------
>>>>>>    2 files changed, 25 insertions(+), 13 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>>>> --- a/mm/mm_init.c
>>>>>> +++ b/mm/mm_init.c
>>>>>> @@ -718,7 +718,7 @@ static void __meminit
>>>>>> init_reserved_page(unsigned long pfn, int nid)
>>>>>>            if (zone_spans_pfn(zone, pfn))
>>>>>>                break;
>>>>>>        }
>>>>>> -    __init_single_page(pfn_to_page(pfn), pfn, zid, nid,
>>>>>> INIT_PAGE_COUNT);
>>>>>> +    __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>>>    }
>>>>>>    #else
>>>>>>    static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>>>> @@ -756,8 +756,8 @@ void __meminit
>>>>>> reserve_bootmem_region(phys_addr_t start,
>>>>>>                  init_reserved_page(start_pfn, nid);
>>>>>>    -            /* Avoid false-positive PageTail() */
>>>>>> -            INIT_LIST_HEAD(&page->lru);
>>>>>> +            /* Init page count for reserved region */
>>>>>
>>>>> Please add a comment that describes _why_ we initialize the page
>>>>> count here.
>>>>>
>>>>>> + init_page_count(page);
>>>>>>                  /*
>>>>>>                 * no need for atomic set_bit because the struct
>>>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long
>>>>>> size, int nid, unsigned long zone
>>>>>>            }
>>>>>>              page = pfn_to_page(pfn);
>>>>>> -        __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>>>> -        if (context == MEMINIT_HOTPLUG)
>>>>>> +
>>>>>> +        /* If the context is MEMINIT_EARLY, we will init page
>>>>>> count and
>>>>>> +         * mark page reserved in reserve_bootmem_region, the free
>>>>>> region
>>>>>> +         * wouldn't have page count and we will check the pages 
>>>>>> count
>>>>>> +         * in __free_pages_core.
>>>>>> +         */
>>>>>> +        __init_single_page(page, pfn, zone, nid, 0);
>>>>>> +        if (context == MEMINIT_HOTPLUG) {
>>>>>> +            init_page_count(page);
>>>>>>                __SetPageReserved(page);
>>>>>
>>>>> Rather than calling init_page_count() and __SetPageReserved() for
>>>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT |
>>>>> INIT_PAGE_RESERVED
>>>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>>>
>>>>> But more generally, I wonder if we have to differentiate HOTPLUG
>>>>> here at all.
>>>>> @David, can you comment please?
>>>>
>>>> There are a lot of details to that, and I'll share some I can briefly
>>>> think of.
>>>>
>>>> 1) __SetPageReserved()
>>>>
>>>> I tried removing that a while ago, but there was a blocker (IIRC
>>>> something about
>>>> ZONE_DEVICE). I still have the patches at [1] and I could probably
>>>> take a look
>>>> if that blocker still exists (I recall that something changed at some
>>>> point, but
>>>> I never had the time to follow up).
>>>>
>>>> But once we stop setting the pages reserved, we might run into issues
>>>> with ...
>>>>
>>>>
>>>> 2) init_page_count()
>>>>
>>>> virtio-mem, XEN balloon and HV-balloon add memory blocks that can
>>>> contain holes.
>>>> set_online_page_callback() is used to intercept memory onlining and
>>>> to expose
>>>> only the pages that are not holes to the buddy: calling
>>>> generic_online_page() on !hole.
>>>>
>>>> Holes are PageReserved but with an initialized page count. Memory
>>>> offlining will fail on
>>>> PageReserved pages -- has_unmovable_pages().
>>>>
>>>>
>>>> At least virtio-mem clears the PageReserved flag of holes when
>>>> onlining memory,
>>>> and currently relies in the page count to be reasonable (so memory
>>>> offlining can work).
>>>>
>>>> static void virtio_mem_set_fake_offline(unsigned long pfn,
>>>>                      unsigned long nr_pages, bool onlined)
>>>> {
>>>>      page_offline_begin();
>>>>      for (; nr_pages--; pfn++) {
>>>>          struct page *page = pfn_to_page(pfn);
>>>>
>>>>          __SetPageOffline(page);
>>>>          if (!onlined) {
>>>>              SetPageDirty(page);
>>>>              /* FIXME: remove after cleanups */
>>>>              ClearPageReserved(page);
>>>>          }
>>>>      }
>>>>      page_offline_end();
>>>> }
>>>>
>>>>
>>>> For virtio-mem, we could initialize the page count there instead. The
>>>> other PV drivers
>>>> might require a bit more thought.
>>>>
>>>>
>>>> [1]
>>>> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>>>>
>>>>>
>>>>>> +        }
>>>>>>              /*
>>>>>>             * Usually, we want to mark the pageblock 
>>>>>> MIGRATE_MOVABLE,
>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>> --- a/mm/page_alloc.c
>>>>>> +++ b/mm/page_alloc.c
>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page,
>>>>>> unsigned int order)
>>>>>>        unsigned int loop;
>>>>>>          /*
>>>>>> -     * When initializing the memmap, __init_single_page() sets the
>>>>>> refcount
>>>>>> -     * of all pages to 1 ("allocated"/"not free"). We have to 
>>>>>> set the
>>>>>> -     * refcount of all involved pages to 0.
>>>>>> +     * When initializing the memmap, memmap_init_range sets the
>>>>>> refcount
>>>>>> +     * of all pages to 1 ("reserved" and "free") in hotplug
>>>>>> context. We
>>>>>> +     * have to set the refcount of all involved pages to 0.
>>>>>> Otherwise,
>>>>>> +     * we don't do it, as reserve_bootmem_region only set the
>>>>>> refcount on
>>>>>> +     * reserve region ("reserved") in early context.
>>>>>>         */
>>>>>
>>>>> Again, why hotplug and early init should be different?
>>>>>
>>>>>> -    prefetchw(p);
>>>>>> -    for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> -        prefetchw(p + 1);
>>>>>> +    if (page_count(page)) {
>>>>>> +        prefetchw(p);
>>>>>> +        for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> +            prefetchw(p + 1);
>>>>>> +            __ClearPageReserved(p);
>>>>>> +            set_page_count(p, 0);
>>>>>> +        }
>>>>>>            __ClearPageReserved(p);
>>>>>>            set_page_count(p, 0);
>>>>
>>>> That looks wrong. if the page count would by pure luck be 0 already
>>>> for hotplugged memory,
>>>> you wouldn't clear the reserved flag.
>>>>
>>>> These changes make me a bit nervous.
>>>
>>>
>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>> need to do something else?
>>>
>>
>> How about the following if statement? But it needs to add more patch
>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>
>> It'll be safer, but more complex. Please comment...
>>
>>     if (context != MEMINIT_EARLY || (page_count(page) || 
>> PageReserved(page)) {
>>
>
> Ideally we could make initialization only depend on the context, and 
> not check for count or the reserved flag.
>

This link is v1, 
https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/

If we can make initialization depend only on the context, I'll modify 
it based on v1.

@Mike,  By the way,  this code will cost more time:

                 if (context == MEMINIT_HOTPLUG)
                         flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
                 __init_single_page(page, pfn, zone, nid, flags);


[    0.014999] On node 0, zone DMA32: 31679 pages in unavailable ranges
[    0.311560] ACPI: PM-Timer IO Port: 0x508


This code will cost less time:

                 __init_single_page(page, pfn, zone, nid, 0);
                 if (context == MEMINIT_HOTPLUG) {
                         init_page_count(page);
                         __SetPageReserved(page);

[    0.014299] On node 0, zone DMA32: 31679 pages in unavailable ranges
[    0.250223] ACPI: PM-Timer IO Port: 0x508
Mike Rapoport Oct. 13, 2023, 8:48 a.m. UTC | #21
On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
> 
> On 2023/10/12 17:23, David Hildenbrand wrote:
> > On 10.10.23 04:31, Yajun Deng wrote:
> > > 
> > > On 2023/10/8 16:57, Yajun Deng wrote:
> > > > > 
> > > > > That looks wrong. if the page count would by pure luck be 0
> > > > > already for hotplugged memory, you wouldn't clear the reserved
> > > > > flag.
> > > > > 
> > > > > These changes make me a bit nervous.
> > > > 
> > > > Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
> > > > need to do something else?
> > > > 
> > > 
> > > How about the following if statement? But it needs to add more patch
> > > like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
> > > 
> > > It'll be safer, but more complex. Please comment...
> > > 
> > >     if (context != MEMINIT_EARLY || (page_count(page) ||
> > > PageReserved(page)) {
> > > 
> > 
> > Ideally we could make initialization only depend on the context, and not
> > check for count or the reserved flag.
> > 
> 
> This link is v1,
> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
> 
> If we could make initialization only depend on the context, I'll modify it
> based on v1.

Although ~20% improvement looks impressive, this is only optimization of a
fraction of the boot time, and realistically, how much 56 msec saves from
the total boot time when you boot a machine with 190G of RAM?

I still think the improvement does not justify the churn, added complexity
and special casing of different code paths of initialization of struct pages.
 
> @Mike,  By the way,  this code will cost more time:
> 
>                 if (context == MEMINIT_HOTPLUG)
>                         flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
>                 __init_single_page(page, pfn, zone, nid, flags);
> 
> 
> [    0.014999] On node 0, zone DMA32: 31679 pages in unavailable ranges
> [    0.311560] ACPI: PM-Timer IO Port: 0x508
> 
> 
> This code will cost less time:
> 
>                 __init_single_page(page, pfn, zone, nid, 0);
>                 if (context == MEMINIT_HOTPLUG) {
>                         init_page_count(page);
>                         __SetPageReserved(page);
> 
> [    0.014299] On node 0, zone DMA32: 31679 pages in unavailable ranges
> [    0.250223] ACPI: PM-Timer IO Port: 0x508
>
Yajun Deng Oct. 13, 2023, 9:29 a.m. UTC | #22
On 2023/10/13 16:48, Mike Rapoport wrote:
> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>> flag.
>>>>>>
>>>>>> These changes make me a bit nervous.
>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>>>> need to do something else?
>>>>>
>>>> How about the following if statement? But it needs to add more patch
>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>
>>>> It'll be safer, but more complex. Please comment...
>>>>
>>>>      if (context != MEMINIT_EARLY || (page_count(page) ||
>>>> PageReserved(page)) {
>>>>
>>> Ideally we could make initialization only depend on the context, and not
>>> check for count or the reserved flag.
>>>
>> This link is v1,
>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>
>> If we could make initialization only depend on the context, I'll modify it
>> based on v1.
> Although ~20% improvement looks impressive, this is only optimization of a
> fraction of the boot time, and realistically, how much 56 msec saves from
> the total boot time when you boot a machine with 190G of RAM?


There are a lot of factors that can affect the total boot time. A saving
of 56 msec may be insignificant on its own.

But if we look at the boot log, we'll see there's a significant time jump.

before:

[    0.250334] ACPI: PM-Timer IO Port: 0x508

[    0.618994] Memory: 173413056K/199884452K available (18440K kernel 
code, 4204K rwdata, 5608K rodata, 3188K init, 17024K bss, 5499616K 
reserved, 20971520K cma-reserved)


after:

[    0.260229] software IO TLB: area num 32.

[    0.563497] Memory: 173413056K/199884452K available (18440K kernel 
code, 4204K rwdata, 5608K rodata, 3188K init, 17024K bss, 5499616K 
reserved, 20971520K cma-reserved)


Memory initialization is time consuming in the boot log.

> I still think the improvement does not justify the churn, added complexity
> and special casing of different code paths of initialization of struct pages.
>   


Because there is a loop, if the order is MAX_ORDER, the loop will run 
1024 times. The following 'if' would be safer:

'if (context != MEMINIT_EARLY || (page_count(page) ||
PageReserved(page)) {'

This is a foundation. We may change this when we are able to safely 
remove page init in the hotplug context one day.

So the case for the early context is only temporary.

>> @Mike,  By the way,  this code will cost more time:
>>
>>                  if (context == MEMINIT_HOTPLUG)
>>                          flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
>>                  __init_single_page(page, pfn, zone, nid, flags);
>>
>>
>> [    0.014999] On node 0, zone DMA32: 31679 pages in unavailable ranges
>> [    0.311560] ACPI: PM-Timer IO Port: 0x508
>>
>>
>> This code will cost less time:
>>
>>                  __init_single_page(page, pfn, zone, nid, 0);
>>                  if (context == MEMINIT_HOTPLUG) {
>>                          init_page_count(page);
>>                          __SetPageReserved(page);
>>
>> [    0.014299] On node 0, zone DMA32: 31679 pages in unavailable ranges
>> [    0.250223] ACPI: PM-Timer IO Port: 0x508
>>
Mike Rapoport Oct. 16, 2023, 6:33 a.m. UTC | #23
On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
> 
> On 2023/10/13 16:48, Mike Rapoport wrote:
> > On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
> > > On 2023/10/12 17:23, David Hildenbrand wrote:
> > > > On 10.10.23 04:31, Yajun Deng wrote:
> > > > > On 2023/10/8 16:57, Yajun Deng wrote:
> > > > > > > That looks wrong. if the page count would by pure luck be 0
> > > > > > > already for hotplugged memory, you wouldn't clear the reserved
> > > > > > > flag.
> > > > > > > 
> > > > > > > These changes make me a bit nervous.
> > > > > > Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
> > > > > > need to do something else?
> > > > > > 
> > > > > How about the following if statement? But it needs to add more patch
> > > > > like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
> > > > > 
> > > > > It'll be safer, but more complex. Please comment...
> > > > > 
> > > > >      if (context != MEMINIT_EARLY || (page_count(page) ||
> > > > > PageReserved(page)) {
> > > > > 
> > > > Ideally we could make initialization only depend on the context, and not
> > > > check for count or the reserved flag.
> > > > 
> > > This link is v1,
> > > https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
> > > 
> > > If we could make initialization only depend on the context, I'll modify it
> > > based on v1.
> > Although ~20% improvement looks impressive, this is only optimization of a
> > fraction of the boot time, and realistically, how much 56 msec saves from
> > the total boot time when you boot a machine with 190G of RAM?
> 
> There are a lot of factors that can affect the total boot time. 56 msec
> saves may be insignificant.
> 
> But if we look at the boot log, we'll see there's a significant time jump.
> 
> before:
> 
> [    0.250334] ACPI: PM-Timer IO Port: 0x508
> [    0.618994] Memory: 173413056K/199884452K available (18440K kernel code,
> 
> after:
> 
> [    0.260229] software IO TLB: area num 32.
> [    0.563497] Memory: 173413056K/199884452K available (18440K kernel code,
> 
> Memory initialization is time consuming in the boot log.

You just confirmed that 56 msec is insignificant and then you send again
the improvement of ~60 msec in memory initialization.

What does this improvement gain in percentage of total boot time?
 
> > I still think the improvement does not justify the churn, added complexity
> > and special casing of different code paths of initialization of struct pages.
> 
> 
> Because there is a loop, if the order is MAX_ORDER, the loop will run 1024
> times. The following 'if' would be safer:
> 
> 'if (context != MEMINIT_EARLY || (page_count(page) || >> PageReserved(page))
> {'

No, it will not.

As a matter of fact, any condition here won't be 'safer', because it makes
the code more complex and less maintainable.
Any future change in __free_pages_core() or one of it's callers will have
to reason what will happen with that condition after the change.
Yajun Deng Oct. 16, 2023, 8:10 a.m. UTC | #24
On 2023/10/16 14:33, Mike Rapoport wrote:
> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>> flag.
>>>>>>>>
>>>>>>>> These changes make me a bit nervous.
>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>>>>>> need to do something else?
>>>>>>>
>>>>>> How about the following if statement? But it needs to add more patch
>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>
>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>
>>>>>>       if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>> PageReserved(page)) {
>>>>>>
>>>>> Ideally we could make initialization only depend on the context, and not
>>>>> check for count or the reserved flag.
>>>>>
>>>> This link is v1,
>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>>>
>>>> If we could make initialization only depend on the context, I'll modify it
>>>> based on v1.
>>> Although ~20% improvement looks impressive, this is only optimization of a
>>> fraction of the boot time, and realistically, how much 56 msec saves from
>>> the total boot time when you boot a machine with 190G of RAM?
>> There are a lot of factors that can affect the total boot time. 56 msec
>> saves may be insignificant.
>>
>> But if we look at the boot log, we'll see there's a significant time jump.
>>
>> before:
>>
>> [    0.250334] ACPI: PM-Timer IO Port: 0x508
>> [    0.618994] Memory: 173413056K/199884452K available (18440K kernel code,
>>
>> after:
>>
>> [    0.260229] software IO TLB: area num 32.
>> [    0.563497] Memory: 173413056K/199884452K available (18440K kernel code,
>> Memory:
>> Memory initialization is time consuming in the boot log.
> You just confirmed that 56 msec is insignificant and then you send again
> the improvement of ~60 msec in memory initialization.
>
> What does this improvement gain in percentage of total boot time?


before:

[   10.692708] Run /init as init process


after:

[   10.666290] Run /init as init process


About 0.25%. The total boot time is variable, depending on how many 
drivers need to be initialized.


>   
>>> I still think the improvement does not justify the churn, added complexity
>>> and special casing of different code paths of initialization of struct pages.
>>
>> Because there is a loop, if the order is MAX_ORDER, the loop will run 1024
>> times. The following 'if' would be safer:
>>
>> 'if (context != MEMINIT_EARLY || (page_count(page) || >> PageReserved(page))
>> {'
> No, it will not.
>
> As the matter of fact any condition here won't be 'safer' because it makes
> the code more complex and less maintainable.
> Any future change in __free_pages_core() or one of it's callers will have
> to reason what will happen with that condition after the change.


To avoid introducing the MEMINIT_LATE context and to keep the code
simpler, this might be a better option:

if (page_count(page) || PageReserved(page))
David Hildenbrand Oct. 16, 2023, 8:16 a.m. UTC | #25
On 16.10.23 10:10, Yajun Deng wrote:
> 
> On 2023/10/16 14:33, Mike Rapoport wrote:
>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>>> flag.
>>>>>>>>>
>>>>>>>>> These changes make me a bit nervous.
>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>>>>>>> need to do something else?
>>>>>>>>
>>>>>>> How about the following if statement? But it needs to add more patch
>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>
>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>
>>>>>>>        if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>> PageReserved(page)) {
>>>>>>>
>>>>>> Ideally we could make initialization only depend on the context, and not
>>>>>> check for count or the reserved flag.
>>>>>>
>>>>> This link is v1,
>>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>>>>
>>>>> If we could make initialization only depend on the context, I'll modify it
>>>>> based on v1.
>>>> Although ~20% improvement looks impressive, this is only optimization of a
>>>> fraction of the boot time, and realistically, how much 56 msec saves from
>>>> the total boot time when you boot a machine with 190G of RAM?
>>> There are a lot of factors that can affect the total boot time. 56 msec
>>> saves may be insignificant.
>>>
>>> But if we look at the boot log, we'll see there's a significant time jump.
>>>
>>> before:
>>>
>>> [    0.250334] ACPI: PM-Timer IO Port: 0x508
>>> [    0.618994] Memory: 173413056K/199884452K available (18440K kernel code,
>>>
>>> after:
>>>
>>> [    0.260229] software IO TLB: area num 32.
>>> [    0.563497] Memory: 173413056K/199884452K available (18440K kernel code,
>>> Memory:
>>> Memory initialization is time consuming in the boot log.
>> You just confirmed that 56 msec is insignificant and then you send again
>> the improvement of ~60 msec in memory initialization.
>>
>> What does this improvement gain in percentage of total boot time?
> 
> 
> before:
> 
> [   10.692708] Run /init as init process
> 
> 
> after:
> 
> [   10.666290] Run /init as init process
> 
> 
> About 0.25%. The total boot time is variable, depending on how many
> drivers need to be initialized.
> 
> 
>>    
>>>> I still think the improvement does not justify the churn, added complexity
>>>> and special casing of different code paths of initialization of struct pages.
>>>
>>> Because there is a loop, if the order is MAX_ORDER, the loop will run 1024
>>> times. The following 'if' would be safer:
>>>
>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >> PageReserved(page))
>>> {'
>> No, it will not.
>>
>> As the matter of fact any condition here won't be 'safer' because it makes
>> the code more complex and less maintainable.
>> Any future change in __free_pages_core() or one of it's callers will have
>> to reason what will happen with that condition after the change.
> 
> 
> To avoid introducing MEMINIT_LATE context and make code simpler. This
> might be a better option.
> 
> if (page_count(page) || PageReserved(page))

I'll have to side with Mike here; this change might not be worth it.
Yajun Deng Oct. 16, 2023, 8:32 a.m. UTC | #26
On 2023/10/16 16:16, David Hildenbrand wrote:
> On 16.10.23 10:10, Yajun Deng wrote:
>>
>> On 2023/10/16 14:33, Mike Rapoport wrote:
>>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>>>> flag.
>>>>>>>>>>
>>>>>>>>>> These changes make me a bit nervous.
>>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or 
>>>>>>>>> do I
>>>>>>>>> need to do something else?
>>>>>>>>>
>>>>>>>> How about the following if statement? But it needs to add more 
>>>>>>>> patch
>>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>>
>>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>>
>>>>>>>>        if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>>> PageReserved(page)) {
>>>>>>>>
>>>>>>> Ideally we could make initialization only depend on the context, 
>>>>>>> and not
>>>>>>> check for count or the reserved flag.
>>>>>>>
>>>>>> This link is v1,
>>>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/ 
>>>>>>
>>>>>>
>>>>>> If we could make initialization only depend on the context, I'll 
>>>>>> modify it
>>>>>> based on v1.
>>>>> Although ~20% improvement looks impressive, this is only 
>>>>> optimization of a
>>>>> fraction of the boot time, and realistically, how much 56 msec 
>>>>> saves from
>>>>> the total boot time when you boot a machine with 190G of RAM?
>>>> There are a lot of factors that can affect the total boot time. 56 
>>>> msec
>>>> saves may be insignificant.
>>>>
>>>> But if we look at the boot log, we'll see there's a significant 
>>>> time jump.
>>>>
>>>> before:
>>>>
>>>> [    0.250334] ACPI: PM-Timer IO Port: 0x508
>>>> [    0.618994] Memory: 173413056K/199884452K available (18440K 
>>>> kernel code,
>>>>
>>>> after:
>>>>
>>>> [    0.260229] software IO TLB: area num 32.
>>>> [    0.563497] Memory: 173413056K/199884452K available (18440K 
>>>> kernel code,
>>>> Memory:
>>>> Memory initialization is time consuming in the boot log.
>>> You just confirmed that 56 msec is insignificant and then you send 
>>> again
>>> the improvement of ~60 msec in memory initialization.
>>>
>>> What does this improvement gain in percentage of total boot time?
>>
>>
>> before:
>>
>> [   10.692708] Run /init as init process
>>
>>
>> after:
>>
>> [   10.666290] Run /init as init process
>>
>>
>> About 0.25%. The total boot time is variable, depending on how many
>> drivers need to be initialized.
>>
>>
>>>>> I still think the improvement does not justify the churn, added 
>>>>> complexity
>>>>> and special casing of different code paths of initialization of 
>>>>> struct pages.
>>>>
>>>> Because there is a loop, if the order is MAX_ORDER, the loop will 
>>>> run 1024
>>>> times. The following 'if' would be safer:
>>>>
>>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >> 
>>>> PageReserved(page))
>>>> {'
>>> No, it will not.
>>>
>>> As the matter of fact any condition here won't be 'safer' because it 
>>> makes
>>> the code more complex and less maintainable.
>>> Any future change in __free_pages_core() or one of it's callers will 
>>> have
>>> to reason what will happen with that condition after the change.
>>
>>
>> To avoid introducing MEMINIT_LATE context and make code simpler. This
>> might be a better option.
>>
>> if (page_count(page) || PageReserved(page))
>
> I'll have to side with Mike here; this change might not be worth it.
>

Okay, I got it. Thanks!
David Hildenbrand Oct. 16, 2023, 8:36 a.m. UTC | #27
On 16.10.23 10:32, Yajun Deng wrote:
> 
> On 2023/10/16 16:16, David Hildenbrand wrote:
>> On 16.10.23 10:10, Yajun Deng wrote:
>>>
>>> On 2023/10/16 14:33, Mike Rapoport wrote:
>>>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>>>>> flag.
>>>>>>>>>>>
>>>>>>>>>>> These changes make me a bit nervous.
>>>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or
>>>>>>>>>> do I
>>>>>>>>>> need to do something else?
>>>>>>>>>>
>>>>>>>>> How about the following if statement? But it needs to add more
>>>>>>>>> patch
>>>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>>>
>>>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>>>
>>>>>>>>>         if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>>>> PageReserved(page)) {
>>>>>>>>>
>>>>>>>> Ideally we could make initialization only depend on the context,
>>>>>>>> and not
>>>>>>>> check for count or the reserved flag.
>>>>>>>>
>>>>>>> This link is v1,
>>>>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/
>>>>>>>
>>>>>>>
>>>>>>> If we could make initialization only depend on the context, I'll
>>>>>>> modify it
>>>>>>> based on v1.
>>>>>> Although ~20% improvement looks impressive, this is only
>>>>>> optimization of a
>>>>>> fraction of the boot time, and realistically, how much 56 msec
>>>>>> saves from
>>>>>> the total boot time when you boot a machine with 190G of RAM?
>>>>> There are a lot of factors that can affect the total boot time. 56
>>>>> msec
>>>>> saves may be insignificant.
>>>>>
>>>>> But if we look at the boot log, we'll see there's a significant
>>>>> time jump.
>>>>>
>>>>> before:
>>>>>
>>>>> [    0.250334] ACPI: PM-Timer IO Port: 0x508
>>>>> [    0.618994] Memory: 173413056K/199884452K available (18440K
>>>>> kernel code,
>>>>>
>>>>> after:
>>>>>
>>>>> [    0.260229] software IO TLB: area num 32.
>>>>> [    0.563497] Memory: 173413056K/199884452K available (18440K
>>>>> kernel code,
>>>>> Memory:
>>>>> Memory initialization is time consuming in the boot log.
>>>> You just confirmed that 56 msec is insignificant and then you send
>>>> again
>>>> the improvement of ~60 msec in memory initialization.
>>>>
>>>> What does this improvement gain in percentage of total boot time?
>>>
>>>
>>> before:
>>>
>>> [   10.692708] Run /init as init process
>>>
>>>
>>> after:
>>>
>>> [   10.666290] Run /init as init process
>>>
>>>
>>> About 0.25%. The total boot time is variable, depending on how many
>>> drivers need to be initialized.
>>>
>>>
>>>>>> I still think the improvement does not justify the churn, added
>>>>>> complexity
>>>>>> and special casing of different code paths of initialization of
>>>>>> struct pages.
>>>>>
>>>>> Because there is a loop, if the order is MAX_ORDER, the loop will
>>>>> run 1024
>>>>> times. The following 'if' would be safer:
>>>>>
>>>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >>
>>>>> PageReserved(page))
>>>>> {'
>>>> No, it will not.
>>>>
>>>> As the matter of fact any condition here won't be 'safer' because it
>>>> makes
>>>> the code more complex and less maintainable.
>>>> Any future change in __free_pages_core() or one of it's callers will
>>>> have
>>>> to reason what will happen with that condition after the change.
>>>
>>>
>>> To avoid introducing MEMINIT_LATE context and make code simpler. This
>>> might be a better option.
>>>
>>> if (page_count(page) || PageReserved(page))
>>
>> I'll have to side with Mike here; this change might not be worth it.
>>
> 
> Okay, I got it. Thanks!

IMHO instead of adding more checks to that code we should try to unify 
that handling such that we can just remove it. As expressed, at least 
from the memory hotplug perspective there are still reasons why we need 
that; I can provide some guidance on how to eventually achieve that, but 
it might end up in a bit of work ...

Anyhow, thanks for bringing up that topic; it reminded me that I still 
have pending cleanups to not rely on PageReserved on the memory hotplug 
path.
Yajun Deng Oct. 16, 2023, 10:17 a.m. UTC | #28
On 2023/10/16 16:36, David Hildenbrand wrote:
> On 16.10.23 10:32, Yajun Deng wrote:
>>
>> On 2023/10/16 16:16, David Hildenbrand wrote:
>>> On 16.10.23 10:10, Yajun Deng wrote:
>>>>
>>>> On 2023/10/16 14:33, Mike Rapoport wrote:
>>>>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>>>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>>>>>> flag.
>>>>>>>>>>>>
>>>>>>>>>>>> These changes make me a bit nervous.
>>>>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or
>>>>>>>>>>> do I
>>>>>>>>>>> need to do something else?
>>>>>>>>>>>
>>>>>>>>>> How about the following if statement? But it needs to add more
>>>>>>>>>> patch
>>>>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>>>>
>>>>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>>>>
>>>>>>>>>>         if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>>>>> PageReserved(page)) {
>>>>>>>>>>
>>>>>>>>> Ideally we could make initialization only depend on the context,
>>>>>>>>> and not
>>>>>>>>> check for count or the reserved flag.
>>>>>>>>>
>>>>>>>> This link is v1,
>>>>>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/ 
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> If we could make initialization only depend on the context, I'll
>>>>>>>> modify it
>>>>>>>> based on v1.
>>>>>>> Although ~20% improvement looks impressive, this is only
>>>>>>> optimization of a
>>>>>>> fraction of the boot time, and realistically, how much 56 msec
>>>>>>> saves from
>>>>>>> the total boot time when you boot a machine with 190G of RAM?
>>>>>> There are a lot of factors that can affect the total boot time. 56
>>>>>> msec
>>>>>> saves may be insignificant.
>>>>>>
>>>>>> But if we look at the boot log, we'll see there's a significant
>>>>>> time jump.
>>>>>>
>>>>>> before:
>>>>>>
>>>>>> [    0.250334] ACPI: PM-Timer IO Port: 0x508
>>>>>> [    0.618994] Memory: 173413056K/199884452K available (18440K
>>>>>> kernel code,
>>>>>>
>>>>>> after:
>>>>>>
>>>>>> [    0.260229] software IO TLB: area num 32.
>>>>>> [    0.563497] Memory: 173413056K/199884452K available (18440K
>>>>>> kernel code,
>>>>>> Memory:
>>>>>> Memory initialization is time consuming in the boot log.
>>>>> You just confirmed that 56 msec is insignificant and then you send
>>>>> again
>>>>> the improvement of ~60 msec in memory initialization.
>>>>>
>>>>> What does this improvement gain in percentage of total boot time?
>>>>
>>>>
>>>> before:
>>>>
>>>> [   10.692708] Run /init as init process
>>>>
>>>>
>>>> after:
>>>>
>>>> [   10.666290] Run /init as init process
>>>>
>>>>
>>>> About 0.25%. The total boot time is variable, depending on how many
>>>> drivers need to be initialized.
>>>>
>>>>
>>>>>>> I still think the improvement does not justify the churn, added
>>>>>>> complexity
>>>>>>> and special casing of different code paths of initialization of
>>>>>>> struct pages.
>>>>>>
>>>>>> Because there is a loop, if the order is MAX_ORDER, the loop will
>>>>>> run 1024
>>>>>> times. The following 'if' would be safer:
>>>>>>
>>>>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >>
>>>>>> PageReserved(page))
>>>>>> {'
>>>>> No, it will not.
>>>>>
>>>>> As the matter of fact any condition here won't be 'safer' because it
>>>>> makes
>>>>> the code more complex and less maintainable.
>>>>> Any future change in __free_pages_core() or one of it's callers will
>>>>> have
>>>>> to reason what will happen with that condition after the change.
>>>>
>>>>
>>>> To avoid introducing MEMINIT_LATE context and make code simpler. This
>>>> might be a better option.
>>>>
>>>> if (page_count(page) || PageReserved(page))
>>>
>>> I'll have to side with Mike here; this change might not be worth it.
>>>
>>
>> Okay, I got it. Thanks!
>
> IMHO instead of adding more checks to that code we should try to unify 
> that handling such that we can just remove it. As expressed, at least 
> from the memory hotplug perspective there are still reasons why we 
> need that; I can provide some guidance on how to eventually achieve 
> that, but it might end up in a bit of work ...


Yes, we can't remove it right now. If we want to do that, we have to
clean up the reliance on page count and PageReserved first.

>
> Anyhow, thanks for bringing up that topic; it reminded me that I still 
> have pending cleanups to not rely on PageReserved on the memory 
> hotplug path.
>
Yajun Deng Oct. 17, 2023, 9:58 a.m. UTC | #29
On 2023/10/16 18:17, Yajun Deng wrote:
>
> On 2023/10/16 16:36, David Hildenbrand wrote:
>> On 16.10.23 10:32, Yajun Deng wrote:
>>>
>>> On 2023/10/16 16:16, David Hildenbrand wrote:
>>>> On 16.10.23 10:10, Yajun Deng wrote:
>>>>>
>>>>> On 2023/10/16 14:33, Mike Rapoport wrote:
>>>>>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>>>>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>>>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>>>>>> already for hotplugged memory, you wouldn't clear the 
>>>>>>>>>>>>> reserved
>>>>>>>>>>>>> flag.
>>>>>>>>>>>>>
>>>>>>>>>>>>> These changes make me a bit nervous.
>>>>>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or
>>>>>>>>>>>> do I
>>>>>>>>>>>> need to do something else?
>>>>>>>>>>>>
>>>>>>>>>>> How about the following if statement? But it needs to add more
>>>>>>>>>>> patch
>>>>>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>>>>>
>>>>>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>>>>>
>>>>>>>>>>>         if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>>>>>> PageReserved(page)) {
>>>>>>>>>>>
>>>>>>>>>> Ideally we could make initialization only depend on the context,
>>>>>>>>>> and not
>>>>>>>>>> check for count or the reserved flag.
>>>>>>>>>>
>>>>>>>>> This link is v1,
>>>>>>>>> https://lore.kernel.org/all/20230922070923.355656-1-yajun.deng@linux.dev/ 
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> If we could make initialization only depend on the context, I'll
>>>>>>>>> modify it
>>>>>>>>> based on v1.
>>>>>>>> Although ~20% improvement looks impressive, this is only
>>>>>>>> optimization of a
>>>>>>>> fraction of the boot time, and realistically, how much 56 msec
>>>>>>>> saves from
>>>>>>>> the total boot time when you boot a machine with 190G of RAM?
>>>>>>> There are a lot of factors that can affect the total boot time. 56
>>>>>>> msec
>>>>>>> saves may be insignificant.
>>>>>>>
>>>>>>> But if we look at the boot log, we'll see there's a significant
>>>>>>> time jump.
>>>>>>>
>>>>>>> before:
>>>>>>>
>>>>>>> [    0.250334] ACPI: PM-Timer IO Port: 0x508
>>>>>>> [    0.618994] Memory: 173413056K/199884452K available (18440K
>>>>>>> kernel code,
>>>>>>>
>>>>>>> after:
>>>>>>>
>>>>>>> [    0.260229] software IO TLB: area num 32.
>>>>>>> [    0.563497] Memory: 173413056K/199884452K available (18440K
>>>>>>> kernel code,
>>>>>>> Memory:
>>>>>>> Memory initialization is time consuming in the boot log.
>>>>>> You just confirmed that 56 msec is insignificant and then you send
>>>>>> again
>>>>>> the improvement of ~60 msec in memory initialization.
>>>>>>
>>>>>> What does this improvement gain in percentage of total boot time?
>>>>>
>>>>>
>>>>> before:
>>>>>
>>>>> [   10.692708] Run /init as init process
>>>>>
>>>>>
>>>>> after:
>>>>>
>>>>> [   10.666290] Run /init as init process
>>>>>
>>>>>
>>>>> About 0.25%. The total boot time is variable, depending on how many
>>>>> drivers need to be initialized.
>>>>>
>>>>>
>>>>>>>> I still think the improvement does not justify the churn, added
>>>>>>>> complexity
>>>>>>>> and special casing of different code paths of initialization of
>>>>>>>> struct pages.
>>>>>>>
>>>>>>> Because there is a loop, if the order is MAX_ORDER, the loop will
>>>>>>> run 1024
>>>>>>> times. The following 'if' would be safer:
>>>>>>>
>>>>>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >>
>>>>>>> PageReserved(page))
>>>>>>> {'
>>>>>> No, it will not.
>>>>>>
>>>>>> As the matter of fact any condition here won't be 'safer' because it
>>>>>> makes
>>>>>> the code more complex and less maintainable.
>>>>>> Any future change in __free_pages_core() or one of it's callers will
>>>>>> have
>>>>>> to reason what will happen with that condition after the change.
>>>>>
>>>>>
>>>>> To avoid introducing MEMINIT_LATE context and make code simpler. This
>>>>> might be a better option.
>>>>>
>>>>> if (page_count(page) || PageReserved(page))
>>>>
>>>> I'll have to side with Mike here; this change might not be worth it.
>>>>
>>>
>>> Okay, I got it. Thanks!
>>
>> IMHO instead of adding more checks to that code we should try to 
>> unify that handling such that we can just remove it. As expressed, at 
>> least from the memory hotplug perspective there are still reasons why 
>> we need that; I can provide some guidance on how to eventually 
>> achieve that, but it might end up in a bit of work ...
>
>
> Yes, we can't remove it right now. If we want to do that, we have to 
> clean up rely on page count and PageReserved first.


How about making __free_pages_core separate, like:

/*
 * Release a 2^order block of pages to the buddy allocator during early boot.
 *
 * Unlike __free_pages_core() below, this variant does not touch the per-page
 * refcount or PageReserved flag: it is intended for the MEMINIT_EARLY path,
 * where (per the patch under discussion) pages are initialized with a zero
 * refcount and are never marked reserved, so only the managed-pages
 * accounting, unaccepted-memory handling and the actual free are needed.
 */
void __init __free_pages_core_early(struct page *page, unsigned int order)
{
         unsigned int nr_pages = 1 << order;

         /* These pages become usable: account them to the zone's managed pages. */
         atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

         if (page_contains_unaccepted(page, order)) {
                 /* A whole MAX_ORDER block may be deferred as unaccepted memory. */
                 if (order == MAX_ORDER && __free_unaccepted(page))
                         return;

                 /* Otherwise accept the memory now before freeing it. */
                 accept_page(page, order);
         }

         /*
          * Bypass PCP and place fresh pages right to the tail, primarily
          * relevant for memory onlining.
          */
         __free_pages_ok(page, order, FPI_TO_TAIL);
}

/*
 * Release a 2^order block of pages to the buddy allocator, first resetting
 * each page's state: clear PageReserved and drop the refcount to zero, then
 * hand the block to __free_pages_core_early() for the actual free.
 */
void __free_pages_core(struct page *page, unsigned int order)
{
         unsigned int nr_pages = 1 << order;
         struct page *p = page;
         unsigned int loop;

         /*
          * When initializing the memmap, __init_single_page() sets the
          * refcount of all pages to 1 ("allocated"/"not free"). We have to
          * set the refcount of all involved pages to 0.
          */
         prefetchw(p);
         for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
                 /* Prefetch the next struct page while resetting this one. */
                 prefetchw(p + 1);
                 __ClearPageReserved(p);
                 set_page_count(p, 0);
         }
         /* Last page of the block: nothing further to prefetch. */
         __ClearPageReserved(p);
         set_page_count(p, 0);

         __free_pages_core_early(page, order);
}

We only change the callers that need __free_pages_core_early; it doesn't 
affect other callers.

>
>>
>> Anyhow, thanks for bringing up that topic; it reminded me that I 
>> still have pending cleanups to not rely on PageReserved on the memory 
>> hotplug path.
>>
diff mbox series

Patch

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9716c8a7ade9..3ab8861e1ef3 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -718,7 +718,7 @@  static void __meminit init_reserved_page(unsigned long pfn, int nid)
 		if (zone_spans_pfn(zone, pfn))
 			break;
 	}
-	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
+	__init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
 }
 #else
 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
@@ -756,8 +756,8 @@  void __meminit reserve_bootmem_region(phys_addr_t start,
 
 			init_reserved_page(start_pfn, nid);
 
-			/* Avoid false-positive PageTail() */
-			INIT_LIST_HEAD(&page->lru);
+			/* Init page count for reserved region */
+			init_page_count(page);
 
 			/*
 			 * no need for atomic set_bit because the struct
@@ -888,9 +888,17 @@  void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
 		}
 
 		page = pfn_to_page(pfn);
-		__init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
-		if (context == MEMINIT_HOTPLUG)
+
+		/* If the context is MEMINIT_EARLY, we will init page count and
+		 * mark page reserved in reserve_bootmem_region, the free region
+		 * wouldn't have page count and we will check the pages count
+		 * in __free_pages_core.
+		 */
+		__init_single_page(page, pfn, zone, nid, 0);
+		if (context == MEMINIT_HOTPLUG) {
+			init_page_count(page);
 			__SetPageReserved(page);
+		}
 
 		/*
 		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 06be8821d833..b868caabe8dc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1285,18 +1285,22 @@  void __free_pages_core(struct page *page, unsigned int order)
 	unsigned int loop;
 
 	/*
-	 * When initializing the memmap, __init_single_page() sets the refcount
-	 * of all pages to 1 ("allocated"/"not free"). We have to set the
-	 * refcount of all involved pages to 0.
+	 * When initializing the memmap, memmap_init_range sets the refcount
+	 * of all pages to 1 ("reserved" and "free") in hotplug context. We
+	 * have to set the refcount of all involved pages to 0. Otherwise,
+	 * we don't do it, as reserve_bootmem_region only set the refcount on
+	 * reserve region ("reserved") in early context.
 	 */
-	prefetchw(p);
-	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
-		prefetchw(p + 1);
+	if (page_count(page)) {
+		prefetchw(p);
+		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
+			prefetchw(p + 1);
+			__ClearPageReserved(p);
+			set_page_count(p, 0);
+		}
 		__ClearPageReserved(p);
 		set_page_count(p, 0);
 	}
-	__ClearPageReserved(p);
-	set_page_count(p, 0);
 
 	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);