
mm: Skip the reserved bootmem for compaction

Message ID 20240902122445.11805-1-rongqianfeng@vivo.com (mailing list archive)
State New
Series: mm: Skip the reserved bootmem for compaction

Commit Message

Rong Qianfeng Sept. 2, 2024, 12:24 p.m. UTC
Reserved pages are basically non-LRU pages. This kind of memory can't be
used as a migration source or target, so skipping it can bring some
performance benefit.

Because some drivers may also use PG_reserved, we just set PB_migrate_skip
for those clustered reserved bootmem regions during memory initialization.

Signed-off-by: Rong Qianfeng <rongqianfeng@vivo.com>
---
 include/linux/pageblock-flags.h | 13 +++++++++++
 mm/compaction.c                 | 40 +++++++++++++++++++++++++++++++++
 mm/mm_init.c                    | 14 ++++++++++++
 mm/page_alloc.c                 |  7 ++++++
 4 files changed, 74 insertions(+)
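
For context: compaction consults the per-pageblock PB_migrate_skip bit
through isolation_suitable() in mm/compaction.c, which looks roughly like
the sketch below (quoted from memory, not part of this patch). Both the
migration and free scanners use this test, so a pageblock with the bit set
is passed over:

static inline bool isolation_suitable(struct compact_control *cc,
				      struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}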

Comments

David Hildenbrand Sept. 2, 2024, 1:45 p.m. UTC | #1
On 02.09.24 14:24, Rong Qianfeng wrote:
> Reserved pages are basically non-LRU pages. This kind of memory can't be
> used as a migration source or target, so skipping it can bring some
> performance benefit.

Any numbers? :)

> 
> Because some drivers may also use PG_reserved, we just set PB_migrate_skip
> for those clustered reserved bootmem regions during memory initialization.
> 
> Signed-off-by: Rong Qianfeng <rongqianfeng@vivo.com>
> ---
>   include/linux/pageblock-flags.h | 13 +++++++++++
>   mm/compaction.c                 | 40 +++++++++++++++++++++++++++++++++
>   mm/mm_init.c                    | 14 ++++++++++++
>   mm/page_alloc.c                 |  7 ++++++
>   4 files changed, 74 insertions(+)
> 
> diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
> index fc6b9c87cb0a..63c5b0c69c1a 100644
> --- a/include/linux/pageblock-flags.h
> +++ b/include/linux/pageblock-flags.h
> @@ -86,6 +86,11 @@ void set_pfnblock_flags_mask(struct page *page,
>   	set_pfnblock_flags_mask(page, (1 << PB_migrate_skip),	\
>   			page_to_pfn(page),			\
>   			(1 << PB_migrate_skip))
> +
> +extern void set_pageblock_skip_range(unsigned long start_pfn,
> +				     unsigned long end_pfn);

two tabs indentation on the second line please. Applies to all others as 
well.

> +extern void clear_pageblock_skip_range(unsigned long start_pfn,
> +				       unsigned long end_pfn);
>   #else
>   static inline bool get_pageblock_skip(struct page *page)
>   {
> @@ -97,6 +102,14 @@ static inline void clear_pageblock_skip(struct page *page)
>   static inline void set_pageblock_skip(struct page *page)
>   {
>   }
> +static inline void set_pageblock_skip_range(unsigned long start_pfn,
> +					    unsigned long end_pfn)
> +{
> +}
> +static inline void clear_pageblock_skip_range(unsigned long start_pfn,
> +					      unsigned long end_pfn)
> +{
> +}

[...]

>   /*
>    * Compound pages of >= pageblock_order should consistently be skipped until
>    * released. It is always pointless to compact pages of such order (if they are
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 4ba5607aaf19..8b7dc8e00bf1 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -768,6 +768,13 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>   			__SetPageReserved(page);
>   		}
>   	}
> +
> +	/*
> +	 * Set PB_migrate_skip for reserved region. for cma memory
> +	 * and the memory released by free_reserved_area(), we will
> +	 * clear PB_migrate_skip when they are initialized.
> +	 */
> +	set_pageblock_skip_range(start_pfn, end_pfn);
>   }
>   
>   /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
> @@ -2236,6 +2243,13 @@ void __init init_cma_reserved_pageblock(struct page *page)
>   		set_page_count(p, 0);
>   	} while (++p, --i);
>   
> +	/*
> +	 * We set the PB_migrate_skip in
> +	 * reserve_bootmem_region() for cma
> +	 * memory, clear it now.

You can easily fit this into fewer lines

> +	 */
> +	clear_pageblock_skip(page);
> +
>   	set_pageblock_migratetype(page, MIGRATE_CMA);
>   	set_page_refcounted(page);
>   	/* pages were reserved and not allocated */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b98f9bb28234..a7729dac0198 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5887,6 +5887,13 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
>   	if (pages && s)
>   		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
>   
> +	/*
> +	 * Clear PB_migrate_skip if the memory have released
> +	 * to the buddy system.
> +	 */

... after freeing the memory to the buddy."

And maybe

if (pages) {
	if (s)
		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
	clear_pageblock_skip_range(...)
}

> +	clear_pageblock_skip_range(page_to_pfn(virt_to_page(start)),
> +				   page_to_pfn(virt_to_page(end)));
> +

PHYS_PFN(virt_to_phys(start)) might look a bit nicer, no need to
get pages involved. virt_to_pfn() might be even better, but it's
not available on all archs I think.
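
Spelled out, that suggestion would make the new call site look roughly like
this (just a sketch; PHYS_PFN() and virt_to_phys() are the existing helpers
named above):

	/* derive the PFNs from the physical address, no struct page needed */
	clear_pageblock_skip_range(PHYS_PFN(virt_to_phys(start)),
				   PHYS_PFN(virt_to_phys(end)));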


What about free_reserved_page() ? There might be more, though 
(kimage_free_pages()). You have to take a look at all functions where we 
clear PageReserved.
Rong Qianfeng Sept. 3, 2024, 7:14 a.m. UTC | #2
Hi David,

Thanks very much for the detailed comments and explanations!

On 2024/9/2 21:45, David Hildenbrand wrote:
> On 02.09.24 14:24, Rong Qianfeng wrote:
>> Reserved pages are basically non-LRU pages. This kind of memory can't be
>> used as a migration source or target, so skipping it can bring some
>> performance benefit.
>
> Any numbers? :)

I am still thinking about how to design test cases. If you have any good
suggestions, please tell me. Thank you very much.

>> +
>> +extern void set_pageblock_skip_range(unsigned long start_pfn,
>> +                                  unsigned long end_pfn);
>
> two tabs indentation on the second line please. Applies to all others as
> well.

Got it. Will do in the next version.

>>
>> +     /*
>> +      * We set the PB_migrate_skip in
>> +      * reserve_bootmem_region() for cma
>> +      * memory, clear it now.
>
> You can easily fit this into fewer lines

Will do in the next version. Thanks.

>>
>> +     /*
>> +      * Clear PB_migrate_skip if the memory have released
>> +      * to the buddy system.
>> +      */
>
> ... after freeing the memory to the buddy."
>
> And maybe
>
> if (pages) {
>        if (s)
>                pr_info("Freeing %s memory: %ldK\n", s, K(pages));
>        clear_pageblock_skip_range(...)
> }
>
>> +	clear_pageblock_skip_range(page_to_pfn(virt_to_page(start)),
>> +				   page_to_pfn(virt_to_page(end)));
>> +
>
> PHYS_PFN(virt_to_phys(start)) might look a bit nicer, no need to
> get pages involved. virt_to_pfn() might be even better, but it's
> not available on all archs I think.

You are right, I tried to use virt_to_pfn, but later found out that it is
not supported on x86.

>
>
> What about free_reserved_page() ? There might be more, though
> (kimage_free_pages()). You have to take a look at all functions where we
> clear PageReserved.

Thanks for your reminder, I found that I missed a lot of functions.
Maybe a better choice is to clear PB_migrate_skip in free_reserved_page()
to reduce the amount of modification.
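
As a minimal sketch of that idea (on top of a simplified free_reserved_page()
body; the real helper in include/linux/mm.h also handles allocation tagging),
it could look like:

static inline void free_reserved_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	adjust_managed_page_count(page, 1);
	/* proposed: drop the compaction skip hint for this pageblock */
	clear_pageblock_skip(page);
}

Note that this clears the bit for the whole pageblock as soon as a single
page in it is freed, and does so once per freed page, which is the per-page
handling David cautions about below.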

Best Regards,
Qianfeng
David Hildenbrand Sept. 3, 2024, 9:56 a.m. UTC | #3
On 03.09.24 09:14, Rong Qianfeng wrote:
> Hi David,
> 
> Thanks very much for the detailed comments and explanations!
> 
On 2024/9/2 21:45, David Hildenbrand wrote:
>> On 02.09.24 14:24, Rong Qianfeng wrote:
>>> Reserved pages are basically non-LRU pages. This kind of memory can't be
>>> used as a migration source or target, so skipping it can bring some
>>> performance benefit.
>>
>> Any numbers? :)
> 
> I am still thinking about how to design test cases. If you have any good
> suggestions, please tell me. Thank you very much.

Well, you claim that it can bring a performance improvement, so it's your 
responsibility to prove it :)

I have no real idea how you could measure that, sorry.

This change will make the code more complicated (and as raised, there 
are some corner cases not handled yet). So it had better be worth the price. :)

[...]

> 
>>
>>
>> What about free_reserved_page() ? There might be more, though
>> (kimage_free_pages()). You have to take a look at all functions where we
>> clear PageReserved.
> 
> Thanks for your reminder, I found that I missed a lot of functions.
> Maybe a better choice is to clear PB_migrate_skip in free_reserved_page()
> to reduce the amount of modification.

Hm, maybe. At least for free_reserved_area() it would be beneficial to 
minimize the per-page handling.
Mel Gorman Sept. 4, 2024, 11:13 a.m. UTC | #4
On Mon, Sep 02, 2024 at 08:24:43PM +0800, Rong Qianfeng wrote:
> Reserved pages are basically non-LRU pages. This kind of memory can't be
> used as a migration source or target, so skipping it can bring some
> performance benefit.
> 
> Because some drivers may also use PG_reserved, we just set PB_migrate_skip
> for those clustered reserved bootmem regions during memory initialization.
> 
> Signed-off-by: Rong Qianfeng <rongqianfeng@vivo.com>

I'm not convinced the savings due to skipping a few pages during the scan
would justify the additional code. There would have to be a large number
of reserved pages scattered throughout the zone to make a difference and
even that situation would be a big surprise. I'm not even sure this can be
explicitly tested unless you artificially create reserved pages throughout the
zone, which would not be convincing, or know of a driver that exhibits such
behaviour, in which case my first question is -- what is that driver doing?!?
Rong Qianfeng Sept. 4, 2024, 11:59 a.m. UTC | #5
Hi Mel,

On 2024/9/4 19:13, Mel Gorman wrote:
> On Mon, Sep 02, 2024 at 08:24:43PM +0800, Rong Qianfeng wrote:
>> Reserved pages are basically non-LRU pages. This kind of memory can't be
>> used as a migration source or target, so skipping it can bring some
>> performance benefit.
>>
>> Because some drivers may also use PG_reserved, we just set PB_migrate_skip
>> for those clustered reserved bootmem regions during memory initialization.
>>
>> Signed-off-by: Rong Qianfeng <rongqianfeng@vivo.com>
> I'm not convinced the savings due to skipping a few pages during the scan
> would justify the additional code. There would have to be a large number
> of reserved pages scattered throughout the zone to make a difference and
> even that situation would be a big surprise. I'm not even sure this can be
> explicitly tested unless you artificially create reserved pages throughout the
> zone, which would not be convincing, or know of a driver that exhibits such
> behaviour, in which case my first question is -- what is that driver doing?!?

Thanks for taking the time to reply.

At first I thought that there were not many PageReserved pages, but when I
looked at the memory initialization code, I found that no-map pages were
also marked as PageReserved. On mobile platforms, there are a lot of no-map
pages (for example, an ARM64 MT6991 device has 1065MB of no-map pages).
These pages are usually used by various hardware subsystems such as the
modem. So I think it makes sense to skip these pages.


// no-map and reserved memory are marked as PageReserved
static void __init memmap_init_reserved_pages(void)
{
...
     for_each_mem_region(region) {
...
         if (memblock_is_nomap(region))
             reserve_bootmem_region(start, end, nid);  //for no-map memory

         memblock_set_node(start, end, &memblock.reserved, nid);
     }

     for_each_reserved_mem_region(region) {
         if (!memblock_is_reserved_noinit(region)) {
...
             reserve_bootmem_region(start, end, nid); //for reserved memory
         }
     }

}

Best Regards,
Qianfeng
kernel test robot Sept. 4, 2024, 2:54 p.m. UTC | #6
Hello,

kernel test robot noticed "kernel_BUG_at_arch/x86/mm/physaddr.c" on:

commit: f28b14e8a99199dd7a5a22c621168f9c8788352a ("[PATCH] mm: Skip the reserved bootmem for compaction")
url: https://github.com/intel-lab-lkp/linux/commits/Rong-Qianfeng/mm-Skip-the-reserved-bootmem-for-compaction/20240902-202706
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/20240902122445.11805-1-rongqianfeng@vivo.com/
patch subject: [PATCH] mm: Skip the reserved bootmem for compaction

in testcase: boot

compiler: clang-18
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)


+------------------------------------------+------------+------------+
|                                          | b69256811d | f28b14e8a9 |
+------------------------------------------+------------+------------+
| boot_successes                           | 12         | 0          |
| boot_failures                            | 0          | 12         |
| kernel_BUG_at_arch/x86/mm/physaddr.c     | 0          | 12         |
| Oops:invalid_opcode:#[##]PREEMPT_SMP     | 0          | 12         |
| EIP:__phys_addr                          | 0          | 12         |
| Kernel_panic-not_syncing:Fatal_exception | 0          | 12         |
+------------------------------------------+------------+------------+


If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202409042247.af2035c5-lkp@intel.com


[    7.664934][   T32] ------------[ cut here ]------------
[    7.665373][   T32] kernel BUG at arch/x86/mm/physaddr.c:81!
[    7.665816][   T32] Oops: invalid opcode: 0000 [#1] PREEMPT SMP
[    7.666289][   T32] CPU: 1 UID: 0 PID: 32 Comm: kworker/u10:1 Not tainted 6.11.0-rc6-00524-gf28b14e8a991 #1
[    7.667024][   T32] Workqueue: async async_run_entry_fn
[ 7.667427][ T32] EIP: __phys_addr (arch/x86/mm/physaddr.c:81) 
[ 7.667767][ T32] Code: 5e 5f 5d 31 c9 c3 0f 0b 68 18 6c f9 c1 e8 5a 69 4f 00 0f 0b 68 28 6c f9 c1 e8 4e 69 4f 00 0f 0b 68 38 6c f9 c1 e8 42 69 4f 00 <0f> 0b 68 48 6c f9 c1 e8 36 69 4f 00 90 90 90 90 90 90 3d 00 00 00
All code
========
   0:	5e                   	pop    %rsi
   1:	5f                   	pop    %rdi
   2:	5d                   	pop    %rbp
   3:	31 c9                	xor    %ecx,%ecx
   5:	c3                   	ret
   6:	0f 0b                	ud2
   8:	68 18 6c f9 c1       	push   $0xffffffffc1f96c18
   d:	e8 5a 69 4f 00       	call   0x4f696c
  12:	0f 0b                	ud2
  14:	68 28 6c f9 c1       	push   $0xffffffffc1f96c28
  19:	e8 4e 69 4f 00       	call   0x4f696c
  1e:	0f 0b                	ud2
  20:	68 38 6c f9 c1       	push   $0xffffffffc1f96c38
  25:	e8 42 69 4f 00       	call   0x4f696c
  2a:*	0f 0b                	ud2		<-- trapping instruction
  2c:	68 48 6c f9 c1       	push   $0xffffffffc1f96c48
  31:	e8 36 69 4f 00       	call   0x4f696c
  36:	90                   	nop
  37:	90                   	nop
  38:	90                   	nop
  39:	90                   	nop
  3a:	90                   	nop
  3b:	90                   	nop
  3c:	3d                   	.byte 0x3d
  3d:	00 00                	add    %al,(%rax)
	...

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2
   2:	68 48 6c f9 c1       	push   $0xffffffffc1f96c48
   7:	e8 36 69 4f 00       	call   0x4f6942
   c:	90                   	nop
   d:	90                   	nop
   e:	90                   	nop
   f:	90                   	nop
  10:	90                   	nop
  11:	90                   	nop
  12:	3d                   	.byte 0x3d
  13:	00 00                	add    %al,(%rax)
	...
[    7.669203][   T32] EAX: 00000000 EBX: 0000fd5f ECX: 00000000 EDX: 00000000
[    7.669353][   T32] ESI: 2e7fe000 EDI: ee7fe000 EBP: c3223e74 ESP: c3223e6c
[    7.669353][   T32] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010287
[    7.669353][   T32] CR0: 80050033 CR2: 00000000 CR3: 02617000 CR4: 00040690
[    7.669353][   T32] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[    7.669353][   T32] DR6: fffe0ff0 DR7: 00000400
[    7.669353][   T32] Call Trace:
[ 7.669353][ T32] ? __die_body (arch/x86/kernel/dumpstack.c:478 arch/x86/kernel/dumpstack.c:420) 
[ 7.669353][ T32] ? die (arch/x86/kernel/dumpstack.c:? arch/x86/kernel/dumpstack.c:447) 
[ 7.669353][ T32] ? do_trap (arch/x86/kernel/traps.c:? arch/x86/kernel/traps.c:155) 
[ 7.669353][ T32] ? do_error_trap (arch/x86/kernel/traps.c:175) 
[ 7.669353][ T32] ? __phys_addr (arch/x86/mm/physaddr.c:81) 
[ 7.669353][ T32] ? exc_overflow (arch/x86/kernel/traps.c:252) 
[ 7.669353][ T32] ? handle_invalid_op (arch/x86/kernel/traps.c:212) 
[ 7.669353][ T32] ? __phys_addr (arch/x86/mm/physaddr.c:81) 
[ 7.669353][ T32] ? exc_invalid_op (arch/x86/kernel/traps.c:267) 
[ 7.669353][ T32] ? handle_exception (arch/x86/entry/entry_32.S:1047) 
[ 7.669353][ T32] ? exc_overflow (arch/x86/kernel/traps.c:252) 
[ 7.669353][ T32] ? __phys_addr (arch/x86/mm/physaddr.c:81) 
[ 7.669353][ T32] ? exc_overflow (arch/x86/kernel/traps.c:252) 
[ 7.669353][ T32] ? __phys_addr (arch/x86/mm/physaddr.c:81) 
[ 7.669353][ T32] free_reserved_area (mm/page_alloc.c:5895) 
[ 7.669353][ T32] free_init_pages (arch/x86/mm/init.c:927) 
[ 7.669353][ T32] ? populate_rootfs (init/initramfs.c:691) 
[ 7.669353][ T32] free_initrd_mem (arch/x86/mm/init.c:987) 
[ 7.669353][ T32] do_populate_rootfs (init/initramfs.c:?) 
[ 7.669353][ T32] async_run_entry_fn (kernel/async.c:136) 
[ 7.669353][ T32] process_one_work (kernel/workqueue.c:3236) 
[ 7.669353][ T32] worker_thread (kernel/workqueue.c:3306 kernel/workqueue.c:3389) 
[ 7.669353][ T32] kthread (kernel/kthread.c:391) 
[ 7.669353][ T32] ? pr_cont_work (kernel/workqueue.c:3339) 
[ 7.669353][ T32] ? kthread_unuse_mm (kernel/kthread.c:342) 
[ 7.669353][ T32] ret_from_fork (arch/x86/kernel/process.c:153) 
[ 7.669353][ T32] ret_from_fork_asm (arch/x86/entry/entry_32.S:737) 
[ 7.669353][ T32] entry_INT80_32 (arch/x86/entry/entry_32.S:944) 
[    7.669353][   T32] Modules linked in:
[    7.683090][   T32] ---[ end trace 0000000000000000 ]---
[ 7.683500][ T32] EIP: __phys_addr (arch/x86/mm/physaddr.c:81) 
[ 7.683844][ T32] Code: 5e 5f 5d 31 c9 c3 0f 0b 68 18 6c f9 c1 e8 5a 69 4f 00 0f 0b 68 28 6c f9 c1 e8 4e 69 4f 00 0f 0b 68 38 6c f9 c1 e8 42 69 4f 00 <0f> 0b 68 48 6c f9 c1 e8 36 69 4f 00 90 90 90 90 90 90 3d 00 00 00
All code
========
   0:	5e                   	pop    %rsi
   1:	5f                   	pop    %rdi
   2:	5d                   	pop    %rbp
   3:	31 c9                	xor    %ecx,%ecx
   5:	c3                   	ret
   6:	0f 0b                	ud2
   8:	68 18 6c f9 c1       	push   $0xffffffffc1f96c18
   d:	e8 5a 69 4f 00       	call   0x4f696c
  12:	0f 0b                	ud2
  14:	68 28 6c f9 c1       	push   $0xffffffffc1f96c28
  19:	e8 4e 69 4f 00       	call   0x4f696c
  1e:	0f 0b                	ud2
  20:	68 38 6c f9 c1       	push   $0xffffffffc1f96c38
  25:	e8 42 69 4f 00       	call   0x4f696c
  2a:*	0f 0b                	ud2		<-- trapping instruction
  2c:	68 48 6c f9 c1       	push   $0xffffffffc1f96c48
  31:	e8 36 69 4f 00       	call   0x4f696c
  36:	90                   	nop
  37:	90                   	nop
  38:	90                   	nop
  39:	90                   	nop
  3a:	90                   	nop
  3b:	90                   	nop
  3c:	3d                   	.byte 0x3d
  3d:	00 00                	add    %al,(%rax)
	...

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2
   2:	68 48 6c f9 c1       	push   $0xffffffffc1f96c48
   7:	e8 36 69 4f 00       	call   0x4f6942
   c:	90                   	nop
   d:	90                   	nop
   e:	90                   	nop
   f:	90                   	nop
  10:	90                   	nop
  11:	90                   	nop
  12:	3d                   	.byte 0x3d
  13:	00 00                	add    %al,(%rax)


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240904/202409042247.af2035c5-lkp@intel.com
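
(The backtrace points at free_reserved_area() at mm/page_alloc.c:5895, i.e.
the clear_pageblock_skip_range() call added by this patch: while freeing the
initrd, virt_to_page() is applied to the freed range's virtual addresses and
trips a sanity check in __phys_addr() on x86-32.)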
Mike Rapoport Sept. 4, 2024, 3:38 p.m. UTC | #7
On Wed, Sep 04, 2024 at 07:59:37PM +0800, Rong Qianfeng wrote:
> Hi Mel,
> 
> On 2024/9/4 19:13, Mel Gorman wrote:
> > On Mon, Sep 02, 2024 at 08:24:43PM +0800, Rong Qianfeng wrote:
> > > Reserved pages are basically non-LRU pages. This kind of memory can't be
> > > used as a migration source or target, so skipping it can bring some
> > > performance benefit.
> > > 
> > > Because some drivers may also use PG_reserved, we just set PB_migrate_skip
> > > for those clustered reserved bootmem regions during memory initialization.
> > > 
> > > Signed-off-by: Rong Qianfeng <rongqianfeng@vivo.com>
> > I'm not convinced the savings due to skipping a few pages during the scan
> > would justify the additional code. There would have to be a large number
> > of reserved pages scattered throughout the zone to make a difference and
> > even that situation would be a big surprise. I'm not even sure this can be
> > explicitly tested unless you artificially create reserved pages throughout the
> > zone, which would not be convincing, or know of a driver that exhibits such
> > behaviour, in which case my first question is -- what is that driver doing?!?
> 
> Thanks for taking the time to reply.
> 
> At first I thought that there were not many PageReserved pages, but when I
> looked at the memory initialization code, I found that no-map pages were
> also marked as PageReserved. On mobile platforms, there are a lot of no-map
> pages (for example, an ARM64 MT6991 device has 1065MB of no-map pages).
> These pages are usually used by various hardware subsystems such as the
> modem. So I think it makes sense to skip these pages.
> 
> 
> // no-map and reserved memory are marked as PageReserved
> static void __init memmap_init_reserved_pages(void)
> {
> ...
>     for_each_mem_region(region) {
> ...
>         if (memblock_is_nomap(region))
>             reserve_bootmem_region(start, end, nid);  //for no-map memory

If nomap regions are a problem, wouldn't it be simpler to make all
pageblocks of a nomap region PB_migrate_skip here and leave other reserved
pages alone?

> 
>         memblock_set_node(start, end, &memblock.reserved, nid);
>     }
> 
>     for_each_reserved_mem_region(region) {
>         if (!memblock_is_reserved_noinit(region)) {
> ...
>             reserve_bootmem_region(start, end, nid); //for reserved memory
>         }
>     }
> 
> }
> 
> Best Regards,
> Qianfeng
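
Concretely, Mike's variant might look something like the sketch below
(a sketch only, under the assumption that start/end are the physical
addresses already computed in memmap_init_reserved_pages(), converted to
PFNs with the usual PFN_DOWN()/PFN_UP() helpers):

static void __init memmap_init_reserved_pages(void)
{
...
	for_each_mem_region(region) {
...
		if (memblock_is_nomap(region)) {
			reserve_bootmem_region(start, end, nid);
			/* hint compaction away from nomap blocks only */
			set_pageblock_skip_range(PFN_DOWN(start), PFN_UP(end));
		}

		memblock_set_node(start, end, &memblock.reserved, nid);
	}
...
}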
Rong Qianfeng Sept. 5, 2024, 3:10 a.m. UTC | #8
Hi Mike,

On 2024/9/4 23:38, Mike Rapoport wrote:
> On Wed, Sep 04, 2024 at 07:59:37PM +0800, Rong Qianfeng wrote:
>> Hi Mel,
>>
>> On 2024/9/4 19:13, Mel Gorman wrote:
>>> On Mon, Sep 02, 2024 at 08:24:43PM +0800, Rong Qianfeng wrote:
>>>> Reserved pages are basically non-LRU pages. This kind of memory can't be
>>>> used as a migration source or target, so skipping it can bring some
>>>> performance benefit.
>>>>
>>>> Because some drivers may also use PG_reserved, we just set PB_migrate_skip
>>>> for those clustered reserved bootmem regions during memory initialization.
>>>>
>>>> Signed-off-by: Rong Qianfeng <rongqianfeng@vivo.com>
>>> I'm not convinced the savings due to skipping a few pages during the scan
>>> would justify the additional code. There would have to be a large number
>>> of reserved pages scattered throughout the zone to make a difference and
>>> even that situation would be a big surprise. I'm not even sure this can be
>>> explicitly tested unless you artificially create reserved pages throughout the
>>> zone, which would not be convincing, or know of a driver that exhibits such
>>> behaviour, in which case my first question is -- what is that driver doing?!?
>> Thanks for taking the time to reply.
>>
>> At first I thought that there were not many PageReserved pages, but when I
>> looked at the memory initialization code, I found that no-map pages were
>> also marked as PageReserved. On mobile platforms, there are a lot of no-map
>> pages (for example, an ARM64 MT6991 device has 1065MB of no-map pages).
>> These pages are usually used by various hardware subsystems such as the
>> modem. So I think it makes sense to skip these pages.
>>
>>
>> // no-map and reserved memory are marked as PageReserved
>> static void __init memmap_init_reserved_pages(void)
>> {
>> ...
>>      for_each_mem_region(region) {
>> ...
>>          if (memblock_is_nomap(region))
>>              reserve_bootmem_region(start, end, nid);  //for no-map memory
> If nomap regions are a problem, wouldn't it be simpler to make all
> pageblocks of a nomap region PB_migrate_skip here and leave other reserved
> pages alone?

Sorry, maybe my explanation confused you. I didn't mean to say that the root
of the problem comes from the no-map region; I just gave a special example.
There may be a lot of reserved pages on some machines, because in DTS you
can use the "no-map" attribute to specify a piece of memory as a no-map
region, and you can also use "reusable" and "shared-dma-pool" to specify a
piece of memory as a reserved region.

Sorry again, I counted the 1065MB of no-map pages on the ARM64 MT6991
wrongly: that figure includes the memory occupied by struct page, kernel
code, kernel data, etc. (which are actually reserved memory). Let's use an
ARM64 MT6991 device with 16GB of RAM as an example: the actual no-map
memory is about 700MB, and the reserved memory is about 1GB.

>
>>          memblock_set_node(start, end, &memblock.reserved, nid);
>>      }
>>
>>      for_each_reserved_mem_region(region) {
>>          if (!memblock_is_reserved_noinit(region)) {
>> ...
>>              reserve_bootmem_region(start, end, nid); //for reserved memory
>>          }
>>      }
>>
>> }
>>
>> Best Regards,
>> Qianfeng

Patch

diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index fc6b9c87cb0a..63c5b0c69c1a 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -86,6 +86,11 @@  void set_pfnblock_flags_mask(struct page *page,
 	set_pfnblock_flags_mask(page, (1 << PB_migrate_skip),	\
 			page_to_pfn(page),			\
 			(1 << PB_migrate_skip))
+
+extern void set_pageblock_skip_range(unsigned long start_pfn,
+				     unsigned long end_pfn);
+extern void clear_pageblock_skip_range(unsigned long start_pfn,
+				       unsigned long end_pfn);
 #else
 static inline bool get_pageblock_skip(struct page *page)
 {
@@ -97,6 +102,14 @@  static inline void clear_pageblock_skip(struct page *page)
 static inline void set_pageblock_skip(struct page *page)
 {
 }
+static inline void set_pageblock_skip_range(unsigned long start_pfn,
+					    unsigned long end_pfn)
+{
+}
+static inline void clear_pageblock_skip_range(unsigned long start_pfn,
+					      unsigned long end_pfn)
+{
+}
 #endif /* CONFIG_COMPACTION */
 
 #endif	/* PAGEBLOCK_FLAGS_H */
diff --git a/mm/compaction.c b/mm/compaction.c
index f2af4493a878..7861588b34f3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -286,6 +286,46 @@  static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
 }
 #endif
 
+/*
+ * This function is currently used to set PB_migrate_skip for the reserved
+ * bootmem which can't be used as migration sources and targets(except CMA).
+ */
+void set_pageblock_skip_range(unsigned long start_pfn,
+			      unsigned long end_pfn)
+{
+	unsigned long pfn;
+
+	start_pfn = ALIGN(start_pfn, pageblock_nr_pages);
+	end_pfn = ALIGN_DOWN(end_pfn, pageblock_nr_pages);
+
+	for (pfn = start_pfn; pfn < end_pfn;
+				pfn += pageblock_nr_pages) {
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+
+			set_pageblock_skip(page);
+		}
+	}
+}
+
+void clear_pageblock_skip_range(unsigned long start_pfn,
+				unsigned long end_pfn)
+{
+	unsigned long pfn;
+
+	start_pfn = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
+	end_pfn = ALIGN(end_pfn, pageblock_nr_pages);
+
+	for (pfn = start_pfn; pfn < end_pfn;
+				pfn += pageblock_nr_pages) {
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+
+			clear_pageblock_skip(page);
+		}
+	}
+}
+
 /*
  * Compound pages of >= pageblock_order should consistently be skipped until
  * released. It is always pointless to compact pages of such order (if they are
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4ba5607aaf19..8b7dc8e00bf1 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -768,6 +768,13 @@  void __meminit reserve_bootmem_region(phys_addr_t start,
 			__SetPageReserved(page);
 		}
 	}
+
+	/*
+	 * Set PB_migrate_skip for reserved region. for cma memory
+	 * and the memory released by free_reserved_area(), we will
+	 * clear PB_migrate_skip when they are initialized.
+	 */
+	set_pageblock_skip_range(start_pfn, end_pfn);
 }
 
 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
@@ -2236,6 +2243,13 @@  void __init init_cma_reserved_pageblock(struct page *page)
 		set_page_count(p, 0);
 	} while (++p, --i);
 
+	/*
+	 * We set the PB_migrate_skip in
+	 * reserve_bootmem_region() for cma
+	 * memory, clear it now.
+	 */
+	clear_pageblock_skip(page);
+
 	set_pageblock_migratetype(page, MIGRATE_CMA);
 	set_page_refcounted(page);
 	/* pages were reserved and not allocated */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b98f9bb28234..a7729dac0198 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5887,6 +5887,13 @@  unsigned long free_reserved_area(void *start, void *end, int poison, const char
 	if (pages && s)
 		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
 
+	/*
+	 * Clear PB_migrate_skip if the memory have released
+	 * to the buddy system.
+	 */
+	clear_pageblock_skip_range(page_to_pfn(virt_to_page(start)),
+				   page_to_pfn(virt_to_page(end)));
+
 	return pages;
 }