
[RFC] mm, page_alloc: avoid page_to_pfn() in move_freepages()

Message ID 20191127102800.51526-1-wangkefeng.wang@huawei.com (mailing list archive)
State New, archived
Series [RFC] mm, page_alloc: avoid page_to_pfn() in move_freepages()

Commit Message

Kefeng Wang Nov. 27, 2019, 10:28 a.m. UTC
The start_pfn and end_pfn are already available in move_freepages_block().
pfn_valid_within() should validate the pfn before touching the page,
or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---

Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):

Unable to handle kernel NULL pointer dereference at virtual address 00000000
pgd = ffffff8008f7e000
[00000000] *pgd=0000000017ffe003, *pud=0000000017ffe003, *pmd=0000000000000000
Internal error: Oops: 96000007 [#1] SMP
CPU: 0 PID: 0 Comm: swapper/0 Tainted: G        W  O    4.4.185 #1

PC is at move_freepages+0x80/0x10c
LR is at move_freepages_block+0xd4/0xf4
pc : [<ffffff80083332e8>] lr : [<ffffff8008333448>] pstate: 80000085
[...]
[<ffffff80083332e8>] move_freepages+0x80/0x10c
[<ffffff8008333448>] move_freepages_block+0xd4/0xf4
[<ffffff8008335414>] __rmqueue+0x2bc/0x44c
[<ffffff800833580c>] get_page_from_freelist+0x268/0x600
[<ffffff8008335e84>] __alloc_pages_nodemask+0x184/0x88c
[<ffffff800837fae8>] new_slab+0xd0/0x494
[<ffffff8008381834>] ___slab_alloc.constprop.29+0x1c8/0x2e8
[<ffffff80083819a8>] __slab_alloc.constprop.28+0x54/0x84
[<ffffff8008381e68>] kmem_cache_alloc+0x64/0x198
[<ffffff80085b04e0>] __build_skb+0x44/0xa4
[<ffffff80085b06e4>] __netdev_alloc_skb+0xe4/0x134

 mm/page_alloc.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)
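
For reference, pfn_valid_within(), which the changelog above relies on, only does
real work when CONFIG_HOLES_IN_ZONE is set; otherwise it compiles away to a
constant. A sketch of its definition from include/linux/mmzone.h of this era
(quoted from memory, so treat the exact form as an assumption):

/*
 * With CONFIG_HOLES_IN_ZONE a pageblock may contain pfns whose memmap was
 * never initialized, so every pfn has to be validated individually; without
 * the option the check optimizes away entirely.
 */
#ifdef CONFIG_HOLES_IN_ZONE
#define pfn_valid_within(pfn)	pfn_valid(pfn)
#else
#define pfn_valid_within(pfn)	(1)
#endif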

Comments

David Hildenbrand Nov. 27, 2019, 10:47 a.m. UTC | #1
[...]

>  static int move_freepages(struct zone *zone,
> -			  struct page *start_page, struct page *end_page,
> +			  unsigned long start_pfn, unsigned long end_pfn,
>  			  int migratetype, int *num_movable)
>  {
>  	struct page *page;
> +	unsigned long pfn;
>  	unsigned int order;
>  	int pages_moved = 0;
>  
> -	for (page = start_page; page <= end_page;) {
> -		if (!pfn_valid_within(page_to_pfn(page))) {
> -			page++;
> +	for (pfn = start_pfn; pfn <= end_pfn;) {
> +		if (!pfn_valid_within(pfn)) {
> +			pfn++;
>  			continue;
>  		}
>  
> +		page = pfn_to_page(pfn);
>  		if (!PageBuddy(page)) {
>  			/*
>  			 * We assume that pages that could be isolated for
> @@ -2268,8 +2270,7 @@ static int move_freepages(struct zone *zone,
>  			if (num_movable &&
>  					(PageLRU(page) || __PageMovable(page)))
>  				(*num_movable)++;
> -
> -			page++;
> +			pfn++;
>  			continue;
>  		}
>  
> @@ -2280,6 +2281,7 @@ static int move_freepages(struct zone *zone,
>  		order = page_order(page);
>  		move_to_free_area(page, &zone->free_area[order], migratetype);
>  		page += 1 << order;

You can drop this now as well, no?

> +		pfn += 1 << order;
>  		pages_moved += 1 << order;
>  	}
>  
> @@ -2289,25 +2291,22 @@ static int move_freepages(struct zone *zone,
>  int move_freepages_block(struct zone *zone, struct page *page,
>  				int migratetype, int *num_movable)
>  {
> -	unsigned long start_pfn, end_pfn;
> -	struct page *start_page, *end_page;
> +	unsigned long start_pfn, end_pfn, pfn;
>  
>  	if (num_movable)
>  		*num_movable = 0;
>  
> -	start_pfn = page_to_pfn(page);
> +	pfn = start_pfn = page_to_pfn(page);

pfn = page_to_pfn(page);

and ...

>  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);

...

start_pfn = pfn & ~(pageblock_nr_pages - 1);

instead?

> -	start_page = pfn_to_page(start_pfn);
> -	end_page = start_page + pageblock_nr_pages - 1;
>  	end_pfn = start_pfn + pageblock_nr_pages - 1;
>  
>  	/* Do not cross zone boundaries */
>  	if (!zone_spans_pfn(zone, start_pfn))
> -		start_page = page;
> +		start_pfn = pfn;
>  	if (!zone_spans_pfn(zone, end_pfn))
>  		return 0;
>  
> -	return move_freepages(zone, start_page, end_page, migratetype,
> +	return move_freepages(zone, start_pfn, end_pfn, migratetype,
>  								num_movable);
>  }
>  
>
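
For illustration only, here is a rough sketch of how the two functions might read
with both suggestions above folded in. This is not the posted patch; the upstream
function also carries comments and VM_BUG_ON-style sanity checks that are omitted
here, so treat it purely as a sketch of the suggested shape:

static int move_freepages(struct zone *zone,
			  unsigned long start_pfn, unsigned long end_pfn,
			  int migratetype, int *num_movable)
{
	struct page *page;
	unsigned long pfn;
	unsigned int order;
	int pages_moved = 0;

	for (pfn = start_pfn; pfn <= end_pfn;) {
		if (!pfn_valid_within(pfn)) {
			pfn++;
			continue;
		}

		/* page is re-derived from pfn on every iteration, so the
		 * separate "page += 1 << order" bump is no longer needed. */
		page = pfn_to_page(pfn);
		if (!PageBuddy(page)) {
			if (num_movable &&
					(PageLRU(page) || __PageMovable(page)))
				(*num_movable)++;
			pfn++;
			continue;
		}

		order = page_order(page);
		move_to_free_area(page, &zone->free_area[order], migratetype);
		pfn += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype, int *num_movable)
{
	unsigned long start_pfn, end_pfn, pfn;

	if (num_movable)
		*num_movable = 0;

	pfn = page_to_pfn(page);
	start_pfn = pfn & ~(pageblock_nr_pages - 1);
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (!zone_spans_pfn(zone, start_pfn))
		start_pfn = pfn;
	if (!zone_spans_pfn(zone, end_pfn))
		return 0;

	return move_freepages(zone, start_pfn, end_pfn, migratetype,
								num_movable);
}
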
Kefeng Wang Nov. 27, 2019, 11:21 a.m. UTC | #2
On 2019/11/27 18:47, David Hildenbrand wrote:
> [...]
> 
>>  static int move_freepages(struct zone *zone,
>> -			  struct page *start_page, struct page *end_page,
>> +			  unsigned long start_pfn, unsigned long end_pfn,
>>  			  int migratetype, int *num_movable)
>>  {
>>  	struct page *page;
>> +	unsigned long pfn;
>>  	unsigned int order;
>>  	int pages_moved = 0;
>>  
>> -	for (page = start_page; page <= end_page;) {
>> -		if (!pfn_valid_within(page_to_pfn(page))) {
>> -			page++;
>> +	for (pfn = start_pfn; pfn <= end_pfn;) {
>> +		if (!pfn_valid_within(pfn)) {
>> +			pfn++;
>>  			continue;
>>  		}
>>  
>> +		page = pfn_to_page(pfn);
>>  		if (!PageBuddy(page)) {
>>  			/*
>>  			 * We assume that pages that could be isolated for
>> @@ -2268,8 +2270,7 @@ static int move_freepages(struct zone *zone,
>>  			if (num_movable &&
>>  					(PageLRU(page) || __PageMovable(page)))
>>  				(*num_movable)++;
>> -
>> -			page++;
>> +			pfn++;
>>  			continue;
>>  		}
>>  
>> @@ -2280,6 +2281,7 @@ static int move_freepages(struct zone *zone,
>>  		order = page_order(page);
>>  		move_to_free_area(page, &zone->free_area[order], migratetype);
>>  		page += 1 << order;
> 
> You can drop this now as well, no?

should do it

> 
>> +		pfn += 1 << order;
>>  		pages_moved += 1 << order;
>>  	}
>>  
>> @@ -2289,25 +2291,22 @@ static int move_freepages(struct zone *zone,
>>  int move_freepages_block(struct zone *zone, struct page *page,
>>  				int migratetype, int *num_movable)
>>  {
>> -	unsigned long start_pfn, end_pfn;
>> -	struct page *start_page, *end_page;
>> +	unsigned long start_pfn, end_pfn, pfn;
>>  
>>  	if (num_movable)
>>  		*num_movable = 0;
>>  
>> -	start_pfn = page_to_pfn(page);
>> +	pfn = start_pfn = page_to_pfn(page);
> 
> pfn = page_to_pfn(page);
> 
> and ...
> 
>>  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
> 
> ...
> 
> start_pfn = pfn & ~(pageblock_nr_pages - 1);
> 
> instead?

will change, thanks for your comments.

> 
>> -	start_page = pfn_to_page(start_pfn);
>> -	end_page = start_page + pageblock_nr_pages - 1;
>>  	end_pfn = start_pfn + pageblock_nr_pages - 1;
>>  
>>  	/* Do not cross zone boundaries */
>>  	if (!zone_spans_pfn(zone, start_pfn))
>> -		start_page = page;
>> +		start_pfn = pfn;
>>  	if (!zone_spans_pfn(zone, end_pfn))
>>  		return 0;
>>  
>> -	return move_freepages(zone, start_page, end_page, migratetype,
>> +	return move_freepages(zone, start_pfn, end_pfn, migratetype,
>>  								num_movable);
>>  }
>>  
>>
> 
>
Michal Hocko Nov. 27, 2019, 11:47 a.m. UTC | #3
On Wed 27-11-19 18:28:00, Kefeng Wang wrote:
> The start_pfn and end_pfn are already available in move_freepages_block().
> pfn_valid_within() should validate the pfn before touching the page,
> or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.
> 
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
> 
> Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):

Is this reproducible with the current upstream kernel? There were large
changes in this area since 4.4.

Btw. the below should be part of the changelog.

> Unable to handle kernel NULL pointer dereference at virtual address 00000000
> pgd = ffffff8008f7e000
> [00000000] *pgd=0000000017ffe003, *pud=0000000017ffe003, *pmd=0000000000000000
> Internal error: Oops: 96000007 [#1] SMP
> CPU: 0 PID: 0 Comm: swapper/0 Tainted: G        W  O    4.4.185 #1
> 
> PC is at move_freepages+0x80/0x10c
> LR is at move_freepages_block+0xd4/0xf4
> pc : [<ffffff80083332e8>] lr : [<ffffff8008333448>] pstate: 80000085
> [...]
> [<ffffff80083332e8>] move_freepages+0x80/0x10c
> [<ffffff8008333448>] move_freepages_block+0xd4/0xf4
> [<ffffff8008335414>] __rmqueue+0x2bc/0x44c
> [<ffffff800833580c>] get_page_from_freelist+0x268/0x600
> [<ffffff8008335e84>] __alloc_pages_nodemask+0x184/0x88c
> [<ffffff800837fae8>] new_slab+0xd0/0x494
> [<ffffff8008381834>] ___slab_alloc.constprop.29+0x1c8/0x2e8
> [<ffffff80083819a8>] __slab_alloc.constprop.28+0x54/0x84
> [<ffffff8008381e68>] kmem_cache_alloc+0x64/0x198
> [<ffffff80085b04e0>] __build_skb+0x44/0xa4
> [<ffffff80085b06e4>] __netdev_alloc_skb+0xe4/0x134
> 
>  mm/page_alloc.c | 25 ++++++++++++-------------
>  1 file changed, 12 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f391c0c4ed1d..59f2c2b860fe 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2246,19 +2246,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
>   * boundary. If alignment is required, use move_freepages_block()
>   */
>  static int move_freepages(struct zone *zone,
> -			  struct page *start_page, struct page *end_page,
> +			  unsigned long start_pfn, unsigned long end_pfn,
>  			  int migratetype, int *num_movable)
>  {
>  	struct page *page;
> +	unsigned long pfn;
>  	unsigned int order;
>  	int pages_moved = 0;
>  
> -	for (page = start_page; page <= end_page;) {
> -		if (!pfn_valid_within(page_to_pfn(page))) {
> -			page++;
> +	for (pfn = start_pfn; pfn <= end_pfn;) {
> +		if (!pfn_valid_within(pfn)) {
> +			pfn++;
>  			continue;
>  		}
>  
> +		page = pfn_to_page(pfn);
>  		if (!PageBuddy(page)) {
>  			/*
>  			 * We assume that pages that could be isolated for
> @@ -2268,8 +2270,7 @@ static int move_freepages(struct zone *zone,
>  			if (num_movable &&
>  					(PageLRU(page) || __PageMovable(page)))
>  				(*num_movable)++;
> -
> -			page++;
> +			pfn++;
>  			continue;
>  		}
>  
> @@ -2280,6 +2281,7 @@ static int move_freepages(struct zone *zone,
>  		order = page_order(page);
>  		move_to_free_area(page, &zone->free_area[order], migratetype);
>  		page += 1 << order;
> +		pfn += 1 << order;
>  		pages_moved += 1 << order;
>  	}
>  
> @@ -2289,25 +2291,22 @@ static int move_freepages(struct zone *zone,
>  int move_freepages_block(struct zone *zone, struct page *page,
>  				int migratetype, int *num_movable)
>  {
> -	unsigned long start_pfn, end_pfn;
> -	struct page *start_page, *end_page;
> +	unsigned long start_pfn, end_pfn, pfn;
>  
>  	if (num_movable)
>  		*num_movable = 0;
>  
> -	start_pfn = page_to_pfn(page);
> +	pfn = start_pfn = page_to_pfn(page);
>  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
> -	start_page = pfn_to_page(start_pfn);
> -	end_page = start_page + pageblock_nr_pages - 1;
>  	end_pfn = start_pfn + pageblock_nr_pages - 1;
>  
>  	/* Do not cross zone boundaries */
>  	if (!zone_spans_pfn(zone, start_pfn))
> -		start_page = page;
> +		start_pfn = pfn;
>  	if (!zone_spans_pfn(zone, end_pfn))
>  		return 0;
>  
> -	return move_freepages(zone, start_page, end_page, migratetype,
> +	return move_freepages(zone, start_pfn, end_pfn, migratetype,
>  								num_movable);
>  }
>  
> -- 
> 2.20.1
>
Kefeng Wang Nov. 27, 2019, 1:13 p.m. UTC | #4
On 2019/11/27 19:47, Michal Hocko wrote:
> On Wed 27-11-19 18:28:00, Kefeng Wang wrote:
>> The start_pfn and end_pfn are already available in move_freepages_block().
>> pfn_valid_within() should validate the pfn before touching the page,
>> or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.
>>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Vlastimil Babka <vbabka@suse.cz>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> ---
>>
>> Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):
> 
> Is this reproducible with the current upstream kernel? There were large
> changes in this area since 4.4.

Our internal tester hit this oops twice, but it couldn't be reproduced for now,
even on the 4.4 kernel; still trying...

But the page_to_pfn() shouldn't be used in move_freepages(), right? ; )

> 
> Btw. the below should be part of the changelog.

Ok, will resend.

> 
>> Unable to handle kernel NULL pointer dereference at virtual address 00000000
>> pgd = ffffff8008f7e000
>> [00000000] *pgd=0000000017ffe003, *pud=0000000017ffe003, *pmd=0000000000000000
>> Internal error: Oops: 96000007 [#1] SMP
>> CPU: 0 PID: 0 Comm: swapper/0 Tainted: G        W  O    4.4.185 #1
>>
>> PC is at move_freepages+0x80/0x10c
>> LR is at move_freepages_block+0xd4/0xf4
>> pc : [<ffffff80083332e8>] lr : [<ffffff8008333448>] pstate: 80000085
>> [...]
>> [<ffffff80083332e8>] move_freepages+0x80/0x10c
>> [<ffffff8008333448>] move_freepages_block+0xd4/0xf4
>> [<ffffff8008335414>] __rmqueue+0x2bc/0x44c
>> [<ffffff800833580c>] get_page_from_freelist+0x268/0x600
>> [<ffffff8008335e84>] __alloc_pages_nodemask+0x184/0x88c
>> [<ffffff800837fae8>] new_slab+0xd0/0x494
>> [<ffffff8008381834>] ___slab_alloc.constprop.29+0x1c8/0x2e8
>> [<ffffff80083819a8>] __slab_alloc.constprop.28+0x54/0x84
>> [<ffffff8008381e68>] kmem_cache_alloc+0x64/0x198
>> [<ffffff80085b04e0>] __build_skb+0x44/0xa4
>> [<ffffff80085b06e4>] __netdev_alloc_skb+0xe4/0x134
>>
Michal Hocko Nov. 27, 2019, 2:13 p.m. UTC | #5
On Wed 27-11-19 21:13:00, Kefeng Wang wrote:
> 
> 
> On 2019/11/27 19:47, Michal Hocko wrote:
> > On Wed 27-11-19 18:28:00, Kefeng Wang wrote:
> >> The start_pfn and end_pfn are already available in move_freepages_block().
> >> pfn_valid_within() should validate the pfn before touching the page,
> >> or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.
> >>
> >> Cc: Andrew Morton <akpm@linux-foundation.org>
> >> Cc: Michal Hocko <mhocko@suse.com>
> >> Cc: Vlastimil Babka <vbabka@suse.cz>
> >> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> >> ---
> >>
> >> Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):
> > 
> > Is this reproducible with the current upstream kernel? There were large
> > changes in this area since 4.4.
> 
> Our internal tester hit this oops twice, but it couldn't be reproduced for now,
> even on the 4.4 kernel; still trying...
> 
> But the page_to_pfn() shouldn't be used in move_freepages(), right? ; )

Well, I do agree that going back and forth between page and pfn is ugly.
So this as a cleanup makes sense to me. But you are trying to fix a bug
and that bug should be explained. NULL ptr dereference sounds like a
memmap is not allocated for the particular pfn and this is a bit
unexpected even with holes, at least on x86, maybe arm64 allows that.
But the changelog should be clear about all this rather than paper over
a deeper problem potentially. Please also make sure to involve arm64
people.
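
One plausible way the pre-patch loop trips over exactly that: with classic
SPARSEMEM (no vmemmap, which matches the reporter's config later in the thread),
page_to_pfn() has to read page->flags to find the page's memory section before it
can compute the pfn, so the old iteration dereferences the struct page before
pfn_valid_within() ever gets a chance to reject it. A sketch, paraphrased from
memory from include/asm-generic/memory_model.h:

/* Classic SPARSEMEM, approximate: computing the pfn already requires
 * dereferencing the struct page (page_to_section() reads __pg->flags). */
#define __page_to_pfn(pg)						\
({	const struct page *__pg = (pg);					\
	int __sec = page_to_section(__pg);				\
	(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec)));	\
})
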
Qian Cai Nov. 27, 2019, 2:28 p.m. UTC | #6
> On Nov 27, 2019, at 9:13 AM, Michal Hocko <mhocko@suse.com> wrote:
> 
> On Wed 27-11-19 21:13:00, Kefeng Wang wrote:
>> 
>> 
>> On 2019/11/27 19:47, Michal Hocko wrote:
>>> On Wed 27-11-19 18:28:00, Kefeng Wang wrote:
>>>> The start_pfn and end_pfn are already available in move_freepages_block().
>>>> pfn_valid_within() should validate the pfn before touching the page,
>>>> or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.
>>>> 
>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>> Cc: Michal Hocko <mhocko@suse.com>
>>>> Cc: Vlastimil Babka <vbabka@suse.cz>
>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>> ---
>>>> 
>>>> Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):
>>> 
>>> Is this reproducible with the current upstream kernel? There were large
>>> changes in this area since 4.4.
>> 
>> Our internal tester hit this oops twice, but it couldn't be reproduced for now,
>> even on the 4.4 kernel; still trying...
>> 
>> But the page_to_pfn() shouldn't be used in move_freepages(), right? ; )
> 
> Well, I do agree that going back and forth between page and pfn is ugly.
> So this as a cleanup makes sense to me. But you are trying to fix a bug
> and that bug should be explained. NULL ptr dereference sounds like a
> memmap is not allocated for the particular pfn and this is a bit
> unexpected even with holes, at least on x86, maybe arm64 allows that.
> But the changelog should be clear about all this rather than paper over
> a deeper problem potentially. Please also make sure to involve arm64
> people.

Indeed. Too many times people are only able to reproduce issues on
old kernels but insist on forward-fixing mainline as well, which only brings
instability there.
Kefeng Wang Nov. 27, 2019, 2:39 p.m. UTC | #7
On 2019/11/27 22:13, Michal Hocko wrote:
> On Wed 27-11-19 21:13:00, Kefeng Wang wrote:
>>
>>
>> On 2019/11/27 19:47, Michal Hocko wrote:
>>> On Wed 27-11-19 18:28:00, Kefeng Wang wrote:
>>>> The start_pfn and end_pfn are already available in move_freepages_block().
>>>> pfn_valid_within() should validate the pfn before touching the page,
>>>> or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.
>>>>
>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>> Cc: Michal Hocko <mhocko@suse.com>
>>>> Cc: Vlastimil Babka <vbabka@suse.cz>
>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>> ---
>>>>
>>>> Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):
>>>
>>> Is this reproducible with the current upstream kernel? There were large
>>> changes in this area since 4.4.
>>
>> Our internal tester hit this oops twice, but it couldn't be reproduced for now,
>> even on the 4.4 kernel; still trying...
>>
>> But the page_to_pfn() shouldn't be used in move_freepages(), right? ; )
> 
> Well, I do agree that going back and forth between page and pfn is ugly.
> So this as a cleanup makes sense to me. But you are trying to fix a bug
> and that bug should be explained. NULL ptr dereference sounds like a
> memmap is not allocated for the particular pfn and this is a bit
> unexpected even with holes, at least on x86, maybe arm64 allows that.
> But the changelog should be clear about all this rather than paper over
> a deeper problem potentially. Please also make sure to involve arm64
> people.

I'm still trying to reproduce it on 4.4 and 5.4; adding Catalin, Will and Mark,
could you give some advice on it? Thanks.

https://lore.kernel.org/linux-mm/54064878-ea85-247a-3382-b96ddf97c667@huawei.com/T/#m87c545730a0a00c45e042937593c59f6552d1246

note:
We backported NUMA patches into 4.4, so CONFIG_HOLES_IN_ZONE is enabled.

# CONFIG_NUMA is not set
CONFIG_HOLES_IN_ZONE=y

CONFIG_SPARSEMEM_MANUAL=y
CONFIG_SPARSEMEM=y
CONFIG_HAVE_MEMORY_PRESENT=y
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER=y
# CONFIG_SPARSEMEM_VMEMMAP is not set


>
Qian Cai Nov. 27, 2019, 3:09 p.m. UTC | #8
> On Nov 27, 2019, at 9:39 AM, Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
> 
> 
> 
> On 2019/11/27 22:13, Michal Hocko wrote:
>> On Wed 27-11-19 21:13:00, Kefeng Wang wrote:
>>> 
>>> 
>>> On 2019/11/27 19:47, Michal Hocko wrote:
>>>> On Wed 27-11-19 18:28:00, Kefeng Wang wrote:
>>>>> The start_pfn and end_pfn are already available in move_freepages_block().
>>>>> pfn_valid_within() should validate the pfn before touching the page,
>>>>> or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.
>>>>> 
>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>>> Cc: Michal Hocko <mhocko@suse.com>
>>>>> Cc: Vlastimil Babka <vbabka@suse.cz>
>>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>>> ---
>>>>> 
>>>>> Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):
>>>> 
>>>> Is this reproducible with the current upstream kernel? There were large
>>>> changes in this area since 4.4.
>>> 
>>> Our internal tester hit this oops twice, but it couldn't be reproduced for now,
>>> even on the 4.4 kernel; still trying...
>>> 
>>> But the page_to_pfn() shouldn't be used in move_freepages(), right? ; )
>> 
>> Well, I do agree that going back and forth between page and pfn is ugly.
>> So this as a cleanup makes sense to me. But you are trying to fix a bug
>> and that bug should be explained. NULL ptr dereference sounds like a
>> memmap is not allocated for the particular pfn and this is a bit
>> unexpected even with holes, at least on x86, maybe arm64 allows that.
>> But the changelog should be clear about all this rather than paper over
>> a deeper problem potentially. Please also make sure to involve arm64
>> people.
> 
> I'm still trying to reproduce it on 4.4 and 5.4; adding Catalin, Will and Mark,
> could you give some advice on it? Thanks.
> 
> https://lore.kernel.org/linux-mm/54064878-ea85-247a-3382-b96ddf97c667@huawei.com/T/#m87c545730a0a00c45e042937593c59f6552d1246
> 
> note:
> We backported NUMA patches into 4.4, so CONFIG_HOLES_IN_ZONE is enabled.
> 
> # CONFIG_NUMA is not set
> CONFIG_HOLES_IN_ZONE=y
> 
> CONFIG_SPARSEMEM_MANUAL=y
> CONFIG_SPARSEMEM=y
> CONFIG_HAVE_MEMORY_PRESENT=y
> CONFIG_SPARSEMEM_EXTREME=y
> CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
> CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER=y
> # CONFIG_SPARSEMEM_VMEMMAP is not set

See commit b13bc35193d9 ("mm/hotplug: invalid PFNs from pfn_to_online_page()"),
which lists a lot of code churn in that area and might indicate that many commits
are missing from your kernel.
David Hildenbrand Nov. 27, 2019, 5:15 p.m. UTC | #9
On 27.11.19 15:13, Michal Hocko wrote:
> On Wed 27-11-19 21:13:00, Kefeng Wang wrote:
>>
>>
>> On 2019/11/27 19:47, Michal Hocko wrote:
>>> On Wed 27-11-19 18:28:00, Kefeng Wang wrote:
>>>> The start_pfn and end_pfn are already available in move_freepages_block().
>>>> pfn_valid_within() should validate the pfn before touching the page,
>>>> or we might access an uninitialized page with CONFIG_HOLES_IN_ZONE configs.
>>>>
>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>> Cc: Michal Hocko <mhocko@suse.com>
>>>> Cc: Vlastimil Babka <vbabka@suse.cz>
>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>> ---
>>>>
>>>> Here is an oops in 4.4 (arm64 with CONFIG_HOLES_IN_ZONE enabled):
>>>
>>> Is this reproducible with the current upstream kernel? There were large
>>> changes in this area since 4.4.
>>
>> Our internal tester hit this oops twice, but it couldn't be reproduced for now,
>> even on the 4.4 kernel; still trying...
>>
>> But the page_to_pfn() shouldn't be used in move_freepages(), right? ; )
> 
> Well, I do agree that going back and forth between page and pfn is ugly.
> So this as a cleanup makes sense to me. But you are trying to fix a bug
> and that bug should be explained. NULL ptr dereference sounds like a
> memmap is not allocated for the particular pfn and this is a bit
> unexpected even with holes, at least on x86, maybe arm64 allows that.

AFAIK ARM allows that. (and arm64)

It's basically CONFIG_HAVE_ARCH_PFN_VALID (and CONFIG_HOLES_IN_ZONE
if I am not wrong)

commit eb33575cf67d3f35fa2510210ef92631266e2465
Author: Mel Gorman <mel@csn.ul.ie>
Date:   Wed May 13 17:34:48 2009 +0100

    [ARM] Double check memmap is actually valid with a memmap has unexpected holes V2
    
    pfn_valid() is meant to be able to tell if a given PFN has valid memmap
    associated with it or not. In FLATMEM, it is expected that holes always
    have valid memmap as long as there is valid PFNs either side of the hole.
    In SPARSEMEM, it is assumed that a valid section has a memmap for the
    entire section.
    
    However, ARM and maybe other embedded architectures in the future free
    memmap backing holes to save memory on the assumption the memmap is never
    used. The page_zone linkages are then broken even though pfn_valid()
    returns true. A walker of the full memmap must then do this additional
    check to ensure the memmap they are looking at is sane by making sure the
    zone and PFN linkages are still valid. This is expensive, but walkers of
    the full memmap are extremely rare.
    [...]
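
The "additional check" that commit introduced is memmap_valid_within(); a sketch
of it, quoted from memory from mm/mmzone.c (exact config guard omitted):

/* A walker of the full memmap double-checks that the pfn and zone
 * linkages of a struct page are still sane before trusting it. */
int memmap_valid_within(unsigned long pfn,
			struct page *page, struct zone *zone)
{
	if (page_to_pfn(page) != pfn)
		return 0;

	if (page_zone(page) != zone)
		return 0;

	return 1;
}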

And

commit 7b7bf499f79de3f6c85a340c8453a78789523f85
Author: Will Deacon <will.deacon@arm.com>
Date:   Thu May 19 13:21:14 2011 +0100

    ARM: 6913/1: sparsemem: allow pfn_valid to be overridden when using SPARSEMEM


Side note: I find overriding pfn_valid() extremely ugly ...
           ... and CONFIG_HOLES_IN_ZONE as well.
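
For reference, the override in question looks roughly like this on arm64 around
v4.4; this is quoted from memory and the exact memblock helper has changed across
kernel versions, so treat it as an assumption rather than the exact code:

#ifdef CONFIG_HAVE_ARCH_PFN_VALID
/* pfn_valid() is answered from the memblock map rather than from
 * sparsemem section presence, so pfns inside a present section can
 * still be reported as holes. */
int pfn_valid(unsigned long pfn)
{
	return memblock_is_memory(pfn << PAGE_SHIFT);
}
EXPORT_SYMBOL(pfn_valid);
#endif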

> But the changelog should be clear about all this rather than paper over
> a deeper problem potentially. Please also make sure to involve arm64
> people.
>

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f391c0c4ed1d..59f2c2b860fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2246,19 +2246,21 @@  static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
  * boundary. If alignment is required, use move_freepages_block()
  */
 static int move_freepages(struct zone *zone,
-			  struct page *start_page, struct page *end_page,
+			  unsigned long start_pfn, unsigned long end_pfn,
 			  int migratetype, int *num_movable)
 {
 	struct page *page;
+	unsigned long pfn;
 	unsigned int order;
 	int pages_moved = 0;
 
-	for (page = start_page; page <= end_page;) {
-		if (!pfn_valid_within(page_to_pfn(page))) {
-			page++;
+	for (pfn = start_pfn; pfn <= end_pfn;) {
+		if (!pfn_valid_within(pfn)) {
+			pfn++;
 			continue;
 		}
 
+		page = pfn_to_page(pfn);
 		if (!PageBuddy(page)) {
 			/*
 			 * We assume that pages that could be isolated for
@@ -2268,8 +2270,7 @@  static int move_freepages(struct zone *zone,
 			if (num_movable &&
 					(PageLRU(page) || __PageMovable(page)))
 				(*num_movable)++;
-
-			page++;
+			pfn++;
 			continue;
 		}
 
@@ -2280,6 +2281,7 @@  static int move_freepages(struct zone *zone,
 		order = page_order(page);
 		move_to_free_area(page, &zone->free_area[order], migratetype);
 		page += 1 << order;
+		pfn += 1 << order;
 		pages_moved += 1 << order;
 	}
 
@@ -2289,25 +2291,22 @@  static int move_freepages(struct zone *zone,
 int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype, int *num_movable)
 {
-	unsigned long start_pfn, end_pfn;
-	struct page *start_page, *end_page;
+	unsigned long start_pfn, end_pfn, pfn;
 
 	if (num_movable)
 		*num_movable = 0;
 
-	start_pfn = page_to_pfn(page);
+	pfn = start_pfn = page_to_pfn(page);
 	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
-	start_page = pfn_to_page(start_pfn);
-	end_page = start_page + pageblock_nr_pages - 1;
 	end_pfn = start_pfn + pageblock_nr_pages - 1;
 
 	/* Do not cross zone boundaries */
 	if (!zone_spans_pfn(zone, start_pfn))
-		start_page = page;
+		start_pfn = pfn;
 	if (!zone_spans_pfn(zone, end_pfn))
 		return 0;
 
-	return move_freepages(zone, start_page, end_page, migratetype,
+	return move_freepages(zone, start_pfn, end_pfn, migratetype,
 								num_movable);
 }