diff mbox series

[-next,5/7] mm: memory: convert wp_page_copy() to use folios

Message ID 20230112083006.163393-6-wangkefeng.wang@huawei.com (mailing list archive)
State New
Headers show
Series mm: remove cgroup_throttle_swaprate() completely | expand

Commit Message

Kefeng Wang Jan. 12, 2023, 8:30 a.m. UTC
The old_page/new_page are converted to old_folio/new_folio in
wp_page_copy(), then replaced related page functions to folio
functions.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/memory.c | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

Comments

Marek Szyprowski Jan. 13, 2023, 1:01 p.m. UTC | #1
Hi

On 12.01.2023 09:30, Kefeng Wang wrote:
> The old_page/new_page are converted to old_folio/new_folio in
> wp_page_copy(), then replaced related page functions to folio
> functions.
>
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>

This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm: 
memory: convert wp_page_copy() to use folios"), causes serious stability 
issues on my ARM based test boards. Here is the example of such crash:

VFS: Mounted root (ext4 filesystem) readonly on device 179:6.
devtmpfs: mounted
Freeing unused kernel image (initmem) memory: 1024K
Run /sbin/init as init process
8<--- cut here ---
Unable to handle kernel NULL pointer dereference at virtual address 
00000004 when read
[00000004] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT SMP ARM
Modules linked in:
CPU: 0 PID: 1 Comm: init Not tainted 6.2.0-rc3-00294-g9ebae00c8e30 #13254
Hardware name: Samsung Exynos (Flattened Device Tree)
PC is at do_wp_page+0x21c/0xd48
LR is at do_wp_page+0x1f8/0xd48
pc : [<c02aa518>]    lr : [<c02aa4f4>]    psr: 60000113
sp : f082de58  ip : 0006fff8  fp : 0000065f
r10: 00000000  r9 : 00000c73  r8 : c2b87318
r7 : c1d78000  r6 : b6ed9000  r5 : 00000000  r4 : f082dedc
r3 : c25d0000  r2 : 00000001  r1 : c0ee9568  r0 : 00000000
Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 4363804a  DAC: 00000051
Register r0 information: NULL pointer
Register r1 information: non-slab/vmalloc memory
Register r2 information: non-paged memory
Register r3 information: slab mm_struct start c25d0000 pointer offset 0 
size 908
Register r4 information: 2-page vmalloc region starting at 0xf082c000 
allocated at kernel_clone+0x54/0x3e4
Register r5 information: NULL pointer
Register r6 information: non-paged memory
Register r7 information: slab task_struct start c1d78000 pointer offset 
0 size 4032
Register r8 information: slab vm_area_struct start c2b87318 pointer 
offset 0 size 68
Register r9 information: non-paged memory
Register r10 information: NULL pointer
Register r11 information: non-paged memory
Register r12 information: non-paged memory
Process init (pid: 1, stack limit = 0x(ptrval))
Stack: (0xf082de58 to 0xf082e000)
...
  do_wp_page from handle_mm_fault+0x938/0xda8
  handle_mm_fault from do_page_fault+0x154/0x408
  do_page_fault from do_DataAbort+0x3c/0xb0
  do_DataAbort from __dabt_usr+0x58/0x60
Exception stack(0xf082dfb0 to 0xf082dff8)
dfa0:                                     00000000 00000001 b6ed9060 
00000000
dfc0: 00000000 b6fea968 b6ed9060 00000000 b6cd4888 00000000 00000000 
00000000
dfe0: b6eda28c be8a9dd0 b6e6ff08 ffff0fcc 60000010 ffffffff
Code: e594a028 e58d301c e5983008 e58d3014 (e59a3004)
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
CPU1: stopping
CPU: 1 PID: 0 Comm: swapper/1 Tainted: G      D 
6.2.0-rc3-00294-g9ebae00c8e30 #13254
Hardware name: Samsung Exynos (Flattened Device Tree)
  unwind_backtrace from show_stack+0x10/0x14
  show_stack from dump_stack_lvl+0x58/0x70
  dump_stack_lvl from do_handle_IPI+0x348/0x374
  do_handle_IPI from ipi_handler+0x18/0x20
  ipi_handler from handle_percpu_devid_irq+0x9c/0x170
  handle_percpu_devid_irq from generic_handle_domain_irq+0x24/0x34
  generic_handle_domain_irq from gic_handle_irq+0x88/0xa8
  gic_handle_irq from generic_handle_arch_irq+0x58/0x78
  generic_handle_arch_irq from call_with_stack+0x18/0x20
  call_with_stack from __irq_svc+0x9c/0xd0
Exception stack(0xf08a9ee0 to 0xf08a9f28)
...
  __irq_svc from cpuidle_enter_state+0x318/0x3d0
  cpuidle_enter_state from cpuidle_enter_state_coupled+0x160/0x400
  cpuidle_enter_state_coupled from cpuidle_enter+0x3c/0x54
  cpuidle_enter from do_idle+0x1f0/0x2b0
  do_idle from cpu_startup_entry+0x18/0x1c
  cpu_startup_entry from secondary_start_kernel+0x1a0/0x230
  secondary_start_kernel from 0x40101a00
---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0000000b ]---

Reverting it on top of linux-next 20220113 together with aaf3f7afbf10 
("mm: swap: remove unneeded cgroup_throttle_swaprate()") fixes the 
stability issues.


> ---
>   mm/memory.c | 47 +++++++++++++++++++++++++----------------------
>   1 file changed, 25 insertions(+), 22 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index b66c425b4d7c..746f485366e8 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3044,7 +3044,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   	struct vm_area_struct *vma = vmf->vma;
>   	struct mm_struct *mm = vma->vm_mm;
>   	struct page *old_page = vmf->page;
> +	struct folio *old_folio = page_folio(old_page);
>   	struct page *new_page = NULL;
> +	struct folio *new_folio = NULL;
>   	pte_t entry;
>   	int page_copied = 0;
>   	struct mmu_notifier_range range;
> @@ -3060,12 +3062,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   							      vmf->address);
>   		if (!new_page)
>   			goto oom;
> +		new_folio = page_folio(new_page);
>   	} else {
> -		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
> -				vmf->address);
> -		if (!new_page)
> +		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
> +					    vmf->address, false);
> +		if (!new_folio)
>   			goto oom;
> -
> +		new_page = &new_folio->page;
>   		ret = __wp_page_copy_user(new_page, old_page, vmf);
>   		if (ret) {
>   			/*
> @@ -3075,9 +3078,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   			 * from the second attempt.
>   			 * The -EHWPOISON case will not be retried.
>   			 */
> -			put_page(new_page);
> -			if (old_page)
> -				put_page(old_page);
> +			folio_put(new_folio);
> +			if (old_folio)
> +				folio_put(old_folio);
>   
>   			delayacct_wpcopy_end();
>   			return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
> @@ -3085,11 +3088,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   		kmsan_copy_page_meta(new_page, old_page);
>   	}
>   
> -	if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
> +	if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
>   		goto oom_free_new;
> -	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
> +	folio_throttle_swaprate(new_folio, GFP_KERNEL);
>   
> -	__SetPageUptodate(new_page);
> +	__folio_mark_uptodate(new_folio);
>   
>   	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
>   				vmf->address & PAGE_MASK,
> @@ -3101,8 +3104,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   	 */
>   	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
>   	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> -		if (old_page) {
> -			if (!PageAnon(old_page)) {
> +		if (old_folio) {
> +			if (!folio_test_anon(old_folio)) {
>   				dec_mm_counter(mm, mm_counter_file(old_page));
>   				inc_mm_counter(mm, MM_ANONPAGES);
>   			}
> @@ -3130,7 +3133,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   		 */
>   		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
>   		page_add_new_anon_rmap(new_page, vma, vmf->address);
> -		lru_cache_add_inactive_or_unevictable(new_page, vma);
> +		folio_add_lru_vma(new_folio, vma);
>   		/*
>   		 * We call the notify macro here because, when using secondary
>   		 * mmu page tables (such as kvm shadow page tables), we want the
> @@ -3139,7 +3142,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   		BUG_ON(unshare && pte_write(entry));
>   		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
>   		update_mmu_cache(vma, vmf->address, vmf->pte);
> -		if (old_page) {
> +		if (old_folio) {
>   			/*
>   			 * Only after switching the pte to the new page may
>   			 * we remove the mapcount here. Otherwise another
> @@ -3166,14 +3169,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   		}
>   
>   		/* Free the old page.. */
> -		new_page = old_page;
> +		new_folio = old_folio;
>   		page_copied = 1;
>   	} else {
>   		update_mmu_tlb(vma, vmf->address, vmf->pte);
>   	}
>   
> -	if (new_page)
> -		put_page(new_page);
> +	if (new_folio)
> +		folio_put(new_folio);
>   
>   	pte_unmap_unlock(vmf->pte, vmf->ptl);
>   	/*
> @@ -3181,19 +3184,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   	 * the above ptep_clear_flush_notify() did already call it.
>   	 */
>   	mmu_notifier_invalidate_range_only_end(&range);
> -	if (old_page) {
> +	if (old_folio) {
>   		if (page_copied)
>   			free_swap_cache(old_page);
> -		put_page(old_page);
> +		folio_put(old_folio);
>   	}
>   
>   	delayacct_wpcopy_end();
>   	return 0;
>   oom_free_new:
> -	put_page(new_page);
> +	folio_put(new_folio);
>   oom:
> -	if (old_page)
> -		put_page(old_page);
> +	if (old_folio)
> +		folio_put(old_folio);
>   
>   	delayacct_wpcopy_end();
>   	return VM_FAULT_OOM;

Best regards
David Hildenbrand Jan. 13, 2023, 1:08 p.m. UTC | #2
On 13.01.23 14:01, Marek Szyprowski wrote:
> Hi
> 
> On 12.01.2023 09:30, Kefeng Wang wrote:
>> The old_page/new_page are converted to old_folio/new_folio in
>> wp_page_copy(), then replaced related page functions to folio
>> functions.
>>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> 
> This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> memory: convert wp_page_copy() to use folios"), causes serious stability
> issues on my ARM based test boards. Here is the example of such crash:

syzbot is also not happy:

https://lkml.kernel.org/r/000000000000807c7805f2205df1@google.com
Daniel Thompson Jan. 13, 2023, 5:45 p.m. UTC | #3
On Fri, Jan 13, 2023 at 02:01:36PM +0100, Marek Szyprowski wrote:
> Hi
>
> On 12.01.2023 09:30, Kefeng Wang wrote:
> > The old_page/new_page are converted to old_folio/new_folio in
> > wp_page_copy(), then replaced related page functions to folio
> > functions.
> >
> > Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>
> This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> memory: convert wp_page_copy() to use folios"), causes serious stability
> issues on my ARM based test boards.

I'm seeing something very similar[1] and it bisected down to this patch.

FWIW it reproduces for me using an ARCH=arm64 defconfig kernel and
with the following QEMU invocation:

    qemu-system-aarch64 \
        -M virt -cpu cortex-a57 -nographic \
        -kernel arch/arm64/boot/Image -initrd rootfs.cpio.gz

The rootfs I used for reproduction is here:
https://fileserver.linaro.org/s/AKcrKWB2Jtyzo6g


Daniel.


[1]:
[    1.740416] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008
[    1.740898] Mem abort info:
[    1.741067]   ESR = 0x0000000096000006
[    1.741291]   EC = 0x25: DABT (current EL), IL = 32 bits
[    1.741557]   SET = 0, FnV = 0
[    1.741734]   EA = 0, S1PTW = 0
[    1.741910]   FSC = 0x06: level 2 translation fault
[    1.742161] Data abort info:
[    1.742328]   ISV = 0, ISS = 0x00000006
[    1.742533]   CM = 0, WnR = 0
[    1.742729] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000447ff000
[    1.743041] [0000000000000008] pgd=0800000044819003, p4d=0800000044819003, pud=080000004481a003, pmd=0000000000000000
[    1.743819] Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
[    1.744118] Modules linked in:
[    1.744353] CPU: 0 PID: 44 Comm: modprobe Not tainted 6.2.0-rc3-00294-g9ebae00c8e30 #244
[    1.744617] Hardware name: linux,dummy-virt (DT)
[    1.744848] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    1.745045] pc : do_wp_page+0x2c4/0xd40
[    1.745218] lr : do_wp_page+0x2bc/0xd40
[    1.745318] sp : ffff80000822bc10
[    1.745415] x29: ffff80000822bc10 x28: ffff60710459b100 x27: 0000000000000002
[    1.745633] x26: ffff80000822bd28 x25: ffff607103b18000 x24: 0000000200100073
[    1.745815] x23: ffff607104811ff0 x22: ffffc6250d418000 x21: 0000000000000000
[    1.745986] x20: ffff607104810360 x19: ffff607104810360 x18: 0000000000000001
[    1.746171] x17: ffff9a4bfa8fa000 x16: ffff800008004000 x15: 0000000000000000
[    1.746363] x14: 000000000000a8a9 x13: 0000000000000004 x12: 0000000000000026
[    1.746548] x11: 0000000000000001 x10: 0000000000000000 x9 : ffffc6250e9e7000
[    1.746756] x8 : ffff60710459b100 x7 : 00000000412a6223 x6 : 0000000000000000
[    1.746928] x5 : 0000000000042cc5 x4 : 0000ffff9acb8000 x3 : ffff80000822bbe4
[    1.747095] x2 : ffff9a4bfa8fa000 x1 : ffff60710459b100 x0 : 0000000100000000
[    1.747341] Call trace:
[    1.747431]  do_wp_page+0x2c4/0xd40
[    1.747547]  __handle_mm_fault+0x704/0x1124
[    1.747649]  handle_mm_fault+0xe4/0x25c
[    1.747736]  do_page_fault+0x1e8/0x4c0
[    1.747834]  do_mem_abort+0x44/0x9c
[    1.747934]  el0_da+0x60/0xd4
[    1.748020]  el0t_64_sync_handler+0xac/0x120
[    1.748134]  el0t_64_sync+0x190/0x194
[    1.748388] Code: f9403340 943cc31e f9402b55 f9400354 (f94006b6)
[    1.748767] ---[ end trace 0000000000000000 ]---


>
> VFS: Mounted root (ext4 filesystem) readonly on device 179:6.
> devtmpfs: mounted
> Freeing unused kernel image (initmem) memory: 1024K
> Run /sbin/init as init process
> 8<--- cut here ---
> Unable to handle kernel NULL pointer dereference at virtual address
> 00000004 when read
> [00000004] *pgd=00000000
> Internal error: Oops: 5 [#1] PREEMPT SMP ARM
> Modules linked in:
> CPU: 0 PID: 1 Comm: init Not tainted 6.2.0-rc3-00294-g9ebae00c8e30 #13254
> Hardware name: Samsung Exynos (Flattened Device Tree)
> PC is at do_wp_page+0x21c/0xd48
> LR is at do_wp_page+0x1f8/0xd48
> pc : [<c02aa518>]    lr : [<c02aa4f4>]    psr: 60000113
> sp : f082de58  ip : 0006fff8  fp : 0000065f
> r10: 00000000  r9 : 00000c73  r8 : c2b87318
> r7 : c1d78000  r6 : b6ed9000  r5 : 00000000  r4 : f082dedc
> r3 : c25d0000  r2 : 00000001  r1 : c0ee9568  r0 : 00000000
> Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
> Control: 10c5387d  Table: 4363804a  DAC: 00000051
> Register r0 information: NULL pointer
> Register r1 information: non-slab/vmalloc memory
> Register r2 information: non-paged memory
> Register r3 information: slab mm_struct start c25d0000 pointer offset 0
> size 908
> Register r4 information: 2-page vmalloc region starting at 0xf082c000
> allocated at kernel_clone+0x54/0x3e4
> Register r5 information: NULL pointer
> Register r6 information: non-paged memory
> Register r7 information: slab task_struct start c1d78000 pointer offset
> 0 size 4032
> Register r8 information: slab vm_area_struct start c2b87318 pointer
> offset 0 size 68
> Register r9 information: non-paged memory
> Register r10 information: NULL pointer
> Register r11 information: non-paged memory
> Register r12 information: non-paged memory
> Process init (pid: 1, stack limit = 0x(ptrval))
> Stack: (0xf082de58 to 0xf082e000)
> ...
>   do_wp_page from handle_mm_fault+0x938/0xda8
>   handle_mm_fault from do_page_fault+0x154/0x408
>   do_page_fault from do_DataAbort+0x3c/0xb0
>   do_DataAbort from __dabt_usr+0x58/0x60
> Exception stack(0xf082dfb0 to 0xf082dff8)
> dfa0:                                     00000000 00000001 b6ed9060
> 00000000
> dfc0: 00000000 b6fea968 b6ed9060 00000000 b6cd4888 00000000 00000000
> 00000000
> dfe0: b6eda28c be8a9dd0 b6e6ff08 ffff0fcc 60000010 ffffffff
> Code: e594a028 e58d301c e5983008 e58d3014 (e59a3004)
> ---[ end trace 0000000000000000 ]---
> Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
> CPU1: stopping
> CPU: 1 PID: 0 Comm: swapper/1 Tainted: G      D
> 6.2.0-rc3-00294-g9ebae00c8e30 #13254
> Hardware name: Samsung Exynos (Flattened Device Tree)
>   unwind_backtrace from show_stack+0x10/0x14
>   show_stack from dump_stack_lvl+0x58/0x70
>   dump_stack_lvl from do_handle_IPI+0x348/0x374
>   do_handle_IPI from ipi_handler+0x18/0x20
>   ipi_handler from handle_percpu_devid_irq+0x9c/0x170
>   handle_percpu_devid_irq from generic_handle_domain_irq+0x24/0x34
>   generic_handle_domain_irq from gic_handle_irq+0x88/0xa8
>   gic_handle_irq from generic_handle_arch_irq+0x58/0x78
>   generic_handle_arch_irq from call_with_stack+0x18/0x20
>   call_with_stack from __irq_svc+0x9c/0xd0
> Exception stack(0xf08a9ee0 to 0xf08a9f28)
> ...
>   __irq_svc from cpuidle_enter_state+0x318/0x3d0
>   cpuidle_enter_state from cpuidle_enter_state_coupled+0x160/0x400
>   cpuidle_enter_state_coupled from cpuidle_enter+0x3c/0x54
>   cpuidle_enter from do_idle+0x1f0/0x2b0
>   do_idle from cpu_startup_entry+0x18/0x1c
>   cpu_startup_entry from secondary_start_kernel+0x1a0/0x230
>   secondary_start_kernel from 0x40101a00
> ---[ end Kernel panic - not syncing: Attempted to kill init!
> exitcode=0x0000000b ]---
>
> Reverting it on top of linux-next 20220113 together with aaf3f7afbf10
> ("mm: swap: remove unneeded cgroup_throttle_swaprate()") fixes the
> stability issues.
>
>
> > ---
> >   mm/memory.c | 47 +++++++++++++++++++++++++----------------------
> >   1 file changed, 25 insertions(+), 22 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index b66c425b4d7c..746f485366e8 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3044,7 +3044,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   	struct vm_area_struct *vma = vmf->vma;
> >   	struct mm_struct *mm = vma->vm_mm;
> >   	struct page *old_page = vmf->page;
> > +	struct folio *old_folio = page_folio(old_page);
> >   	struct page *new_page = NULL;
> > +	struct folio *new_folio = NULL;
> >   	pte_t entry;
> >   	int page_copied = 0;
> >   	struct mmu_notifier_range range;
> > @@ -3060,12 +3062,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   							      vmf->address);
> >   		if (!new_page)
> >   			goto oom;
> > +		new_folio = page_folio(new_page);
> >   	} else {
> > -		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
> > -				vmf->address);
> > -		if (!new_page)
> > +		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
> > +					    vmf->address, false);
> > +		if (!new_folio)
> >   			goto oom;
> > -
> > +		new_page = &new_folio->page;
> >   		ret = __wp_page_copy_user(new_page, old_page, vmf);
> >   		if (ret) {
> >   			/*
> > @@ -3075,9 +3078,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   			 * from the second attempt.
> >   			 * The -EHWPOISON case will not be retried.
> >   			 */
> > -			put_page(new_page);
> > -			if (old_page)
> > -				put_page(old_page);
> > +			folio_put(new_folio);
> > +			if (old_folio)
> > +				folio_put(old_folio);
> >
> >   			delayacct_wpcopy_end();
> >   			return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
> > @@ -3085,11 +3088,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   		kmsan_copy_page_meta(new_page, old_page);
> >   	}
> >
> > -	if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
> > +	if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
> >   		goto oom_free_new;
> > -	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
> > +	folio_throttle_swaprate(new_folio, GFP_KERNEL);
> >
> > -	__SetPageUptodate(new_page);
> > +	__folio_mark_uptodate(new_folio);
> >
> >   	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> >   				vmf->address & PAGE_MASK,
> > @@ -3101,8 +3104,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   	 */
> >   	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
> >   	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> > -		if (old_page) {
> > -			if (!PageAnon(old_page)) {
> > +		if (old_folio) {
> > +			if (!folio_test_anon(old_folio)) {
> >   				dec_mm_counter(mm, mm_counter_file(old_page));
> >   				inc_mm_counter(mm, MM_ANONPAGES);
> >   			}
> > @@ -3130,7 +3133,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   		 */
> >   		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
> >   		page_add_new_anon_rmap(new_page, vma, vmf->address);
> > -		lru_cache_add_inactive_or_unevictable(new_page, vma);
> > +		folio_add_lru_vma(new_folio, vma);
> >   		/*
> >   		 * We call the notify macro here because, when using secondary
> >   		 * mmu page tables (such as kvm shadow page tables), we want the
> > @@ -3139,7 +3142,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   		BUG_ON(unshare && pte_write(entry));
> >   		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
> >   		update_mmu_cache(vma, vmf->address, vmf->pte);
> > -		if (old_page) {
> > +		if (old_folio) {
> >   			/*
> >   			 * Only after switching the pte to the new page may
> >   			 * we remove the mapcount here. Otherwise another
> > @@ -3166,14 +3169,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   		}
> >
> >   		/* Free the old page.. */
> > -		new_page = old_page;
> > +		new_folio = old_folio;
> >   		page_copied = 1;
> >   	} else {
> >   		update_mmu_tlb(vma, vmf->address, vmf->pte);
> >   	}
> >
> > -	if (new_page)
> > -		put_page(new_page);
> > +	if (new_folio)
> > +		folio_put(new_folio);
> >
> >   	pte_unmap_unlock(vmf->pte, vmf->ptl);
> >   	/*
> > @@ -3181,19 +3184,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> >   	 * the above ptep_clear_flush_notify() did already call it.
> >   	 */
> >   	mmu_notifier_invalidate_range_only_end(&range);
> > -	if (old_page) {
> > +	if (old_folio) {
> >   		if (page_copied)
> >   			free_swap_cache(old_page);
> > -		put_page(old_page);
> > +		folio_put(old_folio);
> >   	}
> >
> >   	delayacct_wpcopy_end();
> >   	return 0;
> >   oom_free_new:
> > -	put_page(new_page);
> > +	folio_put(new_folio);
> >   oom:
> > -	if (old_page)
> > -		put_page(old_page);
> > +	if (old_folio)
> > +		folio_put(old_folio);
> >
> >   	delayacct_wpcopy_end();
> >   	return VM_FAULT_OOM;
>
> Best regards
> --
> Marek Szyprowski, PhD
> Samsung R&D Institute Poland
>
Lorenzo Stoakes Jan. 13, 2023, 7:04 p.m. UTC | #4
On Fri, Jan 13, 2023 at 02:08:36PM +0100, David Hildenbrand wrote:
> On 13.01.23 14:01, Marek Szyprowski wrote:
> > Hi
> >
> > On 12.01.2023 09:30, Kefeng Wang wrote:
> > > The old_page/new_page are converted to old_folio/new_folio in
> > > wp_page_copy(), then replaced related page functions to folio
> > > functions.
> > >
> > > Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> >
> > This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> > memory: convert wp_page_copy() to use folios"), causes serious stability
> > issues on my ARM based test boards. Here is the example of such crash:
>
> syzbot is also not happy:
>
> https://lkml.kernel.org/r/000000000000807c7805f2205df1@google.com
>
> --
> Thanks,
>
> David / dhildenb
>

This also completely broke my qemu environment.

In that thread Willy points out that the issue stems from blindly assigning
page_folio(old_page) to old_folio without checking whether it is NULL first,
therefore triggering a NULL pointer deref.

A quick fix would be to put in a check (as shown below) which fixes the issue,
but as Willy said, I think we should drop this until it can be fixed in a
respin.

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3044,7 +3044,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct page *old_page = vmf->page;
-       struct folio *old_folio = page_folio(old_page);
+       struct folio *old_folio = old_page ? page_folio(old_page) : NULL;
SeongJae Park Jan. 13, 2023, 10:16 p.m. UTC | #5
Hello,

On Fri, 13 Jan 2023 19:04:14 +0000 Lorenzo Stoakes <lstoakes@gmail.com> wrote:

> On Fri, Jan 13, 2023 at 02:08:36PM +0100, David Hildenbrand wrote:
> > On 13.01.23 14:01, Marek Szyprowski wrote:
> > > Hi
> > >
> > > On 12.01.2023 09:30, Kefeng Wang wrote:
> > > > The old_page/new_page are converted to old_folio/new_folio in
> > > > wp_page_copy(), then replaced related page functions to folio
> > > > functions.
> > > >
> > > > Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> > >
> > > This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> > > memory: convert wp_page_copy() to use folios"), causes serious stability
> > > issues on my ARM based test boards. Here is the example of such crash:
> >
> > syzbot is also not happy:
> >
> > https://lkml.kernel.org/r/000000000000807c7805f2205df1@google.com
> >
> > --
> > Thanks,
> >
> > David / dhildenb
> >
> 
> This also completely broke my qemu environment.

Same to me.

> 
> In that thread Willy points out that the issue stems from blindly assigning
> page_folio(old_page) to old_folio without checking whether it is NULL first,
> therefore triggering a NULL pointer deref.
> 
> A quick fix would be to put in a check (as shown below) which fixes the issue,
> but as Willy said, I think we should drop this until it can be fixed in a
> respin.
> 
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3044,7 +3044,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>         struct vm_area_struct *vma = vmf->vma;
>         struct mm_struct *mm = vma->vm_mm;
>         struct page *old_page = vmf->page;
> -       struct folio *old_folio = page_folio(old_page);
> +       struct folio *old_folio = old_page ? page_folio(old_page) : NULL;

Tested-by: SeongJae Park <sj@kernel.org>


Thanks,
SJ
Yujie Liu Jan. 15, 2023, 4:01 p.m. UTC | #6
Greeting,

FYI, we noticed BUG:kernel_NULL_pointer_dereference,address due to commit (built with gcc-11):

commit: 94dd2d69bf084166a5537f957dac6a3b79fa238f ("[PATCH -next 5/7] mm: memory: convert wp_page_copy() to use folios")
url: https://github.com/intel-lab-lkp/linux/commits/Kefeng-Wang/mm-huge_memory-make-__do_huge_pmd_anonymous_page-to-take-a-folio/20230112-161810
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/20230112083006.163393-6-wangkefeng.wang@huawei.com/
patch subject: [PATCH -next 5/7] mm: memory: convert wp_page_copy() to use folios

in testcase: boot

on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):


[    6.211602][   T64] BUG: kernel NULL pointer dereference, address: 0000000000000008
[    6.213035][   T64] #PF: supervisor read access in kernel mode
[    6.214169][   T64] #PF: error_code(0x0000) - not-present page
[    6.215275][   T64] PGD 80000001202fc067 P4D 80000001202fc067 PUD 1202f9067 PMD 0
[    6.216694][   T64] Oops: 0000 [#1] SMP PTI
[    6.217525][   T64] CPU: 1 PID: 64 Comm: modprobe Not tainted 6.2.0-rc3-00317-g94dd2d69bf08 #1
[    6.219042][   T64] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[ 6.220947][ T64] RIP: 0010:_compound_head (include/linux/page-flags.h:251) 
[ 6.221957][ T64] Code: 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <48> 8b 47 08 a8 01 75 24 66 90 48 89 f8 c3 cc cc cc cc f7 c7 ff 0f
All code
========
   0:	66 2e 0f 1f 84 00 00 	nopw   %cs:0x0(%rax,%rax,1)
   7:	00 00 00 
   a:	66 66 2e 0f 1f 84 00 	data16 nopw %cs:0x0(%rax,%rax,1)
  11:	00 00 00 00 
  15:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
  1a:	90                   	nop
  1b:	90                   	nop
  1c:	90                   	nop
  1d:	90                   	nop
  1e:	90                   	nop
  1f:	90                   	nop
  20:	90                   	nop
  21:	90                   	nop
  22:	90                   	nop
  23:	90                   	nop
  24:	90                   	nop
  25:	90                   	nop
  26:	90                   	nop
  27:	90                   	nop
  28:	90                   	nop
  29:	90                   	nop
  2a:*	48 8b 47 08          	mov    0x8(%rdi),%rax		<-- trapping instruction
  2e:	a8 01                	test   $0x1,%al
  30:	75 24                	jne    0x56
  32:	66 90                	xchg   %ax,%ax
  34:	48 89 f8             	mov    %rdi,%rax
  37:	c3                   	retq   
  38:	cc                   	int3   
  39:	cc                   	int3   
  3a:	cc                   	int3   
  3b:	cc                   	int3   
  3c:	f7                   	.byte 0xf7
  3d:	c7                   	(bad)  
  3e:	ff 0f                	decl   (%rdi)

Code starting with the faulting instruction
===========================================
   0:	48 8b 47 08          	mov    0x8(%rdi),%rax
   4:	a8 01                	test   $0x1,%al
   6:	75 24                	jne    0x2c
   8:	66 90                	xchg   %ax,%ax
   a:	48 89 f8             	mov    %rdi,%rax
   d:	c3                   	retq   
   e:	cc                   	int3   
   f:	cc                   	int3   
  10:	cc                   	int3   
  11:	cc                   	int3   
  12:	f7                   	.byte 0xf7
  13:	c7                   	(bad)  
  14:	ff 0f                	decl   (%rdi)
[    6.225382][   T64] RSP: 0000:ffffc900004c3d38 EFLAGS: 00010282
[    6.226529][   T64] RAX: 0000000000000a55 RBX: ffffc900004c3df8 RCX: 0000000000003663
[    6.230756][   T64] RDX: 0000000000000000 RSI: 00000000f7f80000 RDI: 0000000000000000
[    6.232224][   T64] RBP: ffff8881202f6240 R08: 8000000003663225 R09: ffff888120201000
[    6.233742][   T64] R10: 0000000000000000 R11: 00000000f7f80684 R12: 00000000f7f80684
[    6.235231][   T64] R13: 0000000000000df8 R14: 0000000000000000 R15: ffff88812024f440
[    6.236759][   T64] FS:  0000000000000000(0000) GS:ffff88842fd00000(0063) knlGS:00000000f7ddb900
[    6.238454][   T64] CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
[    6.239697][   T64] CR2: 0000000000000008 CR3: 00000001009e2000 CR4: 00000000000406e0
[    6.241215][   T64] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    6.242709][   T64] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    6.244212][   T64] Call Trace:
[    6.244933][   T64]  <TASK>
[ 6.245575][ T64] wp_page_copy (mm/memory.c:3047) 
[ 6.246461][ T64] ? do_anonymous_page (arch/x86/include/asm/preempt.h:85 include/linux/spinlock_api_smp.h:143 include/linux/spinlock.h:390 mm/memory.c:4106) 
[ 6.247440][ T64] __handle_mm_fault (mm/memory.c:5061) 
[ 6.248338][ T64] handle_mm_fault (mm/memory.c:5207) 
[ 6.249227][ T64] do_user_addr_fault (arch/x86/mm/fault.c:1407) 
[ 6.250166][ T64] ? do_set_thread_area (arch/x86/kernel/tls.c:152) 
[ 6.251133][ T64] exc_page_fault (arch/x86/include/asm/irqflags.h:40 arch/x86/include/asm/irqflags.h:75 arch/x86/mm/fault.c:1506 arch/x86/mm/fault.c:1554) 
[ 6.252009][ T64] asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570) 
[    6.252931][   T64] RIP: 0023:0xf7df587c
[ 6.253753][ T64] Code: 1c 31 c0 89 5c 24 0c e8 45 d4 10 00 81 c3 96 77 18 00 89 74 24 10 87 de 0f a2 87 de 81 f9 6e 74 65 6c 89 7c 24 14 89 6c 24 18 <89> 83 90 36 00 00 75 14 81 fe 47 65 6e 75 75 0c 81 fa 69 6e 65 49
All code
========
   0:	1c 31                	sbb    $0x31,%al
   2:	c0 89 5c 24 0c e8 45 	rorb   $0x45,-0x17f3dba4(%rcx)
   9:	d4                   	(bad)  
   a:	10 00                	adc    %al,(%rax)
   c:	81 c3 96 77 18 00    	add    $0x187796,%ebx
  12:	89 74 24 10          	mov    %esi,0x10(%rsp)
  16:	87 de                	xchg   %ebx,%esi
  18:	0f a2                	cpuid  
  1a:	87 de                	xchg   %ebx,%esi
  1c:	81 f9 6e 74 65 6c    	cmp    $0x6c65746e,%ecx
  22:	89 7c 24 14          	mov    %edi,0x14(%rsp)
  26:	89 6c 24 18          	mov    %ebp,0x18(%rsp)
  2a:*	89 83 90 36 00 00    	mov    %eax,0x3690(%rbx)		<-- trapping instruction
  30:	75 14                	jne    0x46
  32:	81 fe 47 65 6e 75    	cmp    $0x756e6547,%esi
  38:	75 0c                	jne    0x46
  3a:	81 fa 69 6e 65 49    	cmp    $0x49656e69,%edx

Code starting with the faulting instruction
===========================================
   0:	89 83 90 36 00 00    	mov    %eax,0x3690(%rbx)
   6:	75 14                	jne    0x1c
   8:	81 fe 47 65 6e 75    	cmp    $0x756e6547,%esi
   e:	75 0c                	jne    0x1c
  10:	81 fa 69 6e 65 49    	cmp    $0x49656e69,%edx


If you fix the issue, kindly add following tag
| Reported-by: kernel test robot <yujie.liu@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202301151942.4b0281d1-yujie.liu@intel.com


To reproduce:

        # build kernel
	cd linux
	cp config-6.2.0-rc3-00317-g94dd2d69bf08 .config
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
	make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
	cd <mod-install-dir>
	find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email

        # if come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
Kefeng Wang Jan. 16, 2023, 11:35 a.m. UTC | #7
On 2023/1/14 6:16, SeongJae Park wrote:
> Hello,
> 
> On Fri, 13 Jan 2023 19:04:14 +0000 Lorenzo Stoakes <lstoakes@gmail.com> wrote:
> 
>> On Fri, Jan 13, 2023 at 02:08:36PM +0100, David Hildenbrand wrote:
>>> On 13.01.23 14:01, Marek Szyprowski wrote:
>>>> Hi
>>>>
>>>> On 12.01.2023 09:30, Kefeng Wang wrote:
>>>>> The old_page/new_page are converted to old_folio/new_folio in
>>>>> wp_page_copy(), then replaced related page functions to folio
>>>>> functions.
>>>>>
>>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>>
>>>> This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
>>>> memory: convert wp_page_copy() to use folios"), causes serious stability
>>>> issues on my ARM based test boards. Here is the example of such crash:
>>>
>>> syzbot is also not happy:
>>>
>>> https://lkml.kernel.org/r/000000000000807c7805f2205df1@google.com
>>>
>>> --
>>> Thanks,
>>>
>>> David / dhildenb
>>>
>>
>> This also completely broke my qemu environment.
> 
> Same to me.
> 
>>
>> In that thread Willy points out that the issue stems from blindly assigning
>> page_folio(old_page) to old_folio without checking whether it is NULL first,
>> therefore triggering a NULL pointer deref.
>>
>> A quick fix would be to put in a check (as shown below) which fixes the issue,
>> but as Willy said, I think we should drop this until it can be fixed in a
>> respin.

Hello all, sorry for the break, thanks all to quick fix and analysis, as 
the patch has be dropped from mm-unstable and next, will resend after 
address some comments from Matthew Wilcox and do more test.
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index b66c425b4d7c..746f485366e8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3044,7 +3044,9 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	struct page *old_page = vmf->page;
+	struct folio *old_folio = page_folio(old_page);
 	struct page *new_page = NULL;
+	struct folio *new_folio = NULL;
 	pte_t entry;
 	int page_copied = 0;
 	struct mmu_notifier_range range;
@@ -3060,12 +3062,13 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 							      vmf->address);
 		if (!new_page)
 			goto oom;
+		new_folio = page_folio(new_page);
 	} else {
-		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
-				vmf->address);
-		if (!new_page)
+		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+					    vmf->address, false);
+		if (!new_folio)
 			goto oom;
-
+		new_page = &new_folio->page;
 		ret = __wp_page_copy_user(new_page, old_page, vmf);
 		if (ret) {
 			/*
@@ -3075,9 +3078,9 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 			 * from the second attempt.
 			 * The -EHWPOISON case will not be retried.
 			 */
-			put_page(new_page);
-			if (old_page)
-				put_page(old_page);
+			folio_put(new_folio);
+			if (old_folio)
+				folio_put(old_folio);
 
 			delayacct_wpcopy_end();
 			return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
@@ -3085,11 +3088,11 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		kmsan_copy_page_meta(new_page, old_page);
 	}
 
-	if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
+	if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
 		goto oom_free_new;
-	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+	folio_throttle_swaprate(new_folio, GFP_KERNEL);
 
-	__SetPageUptodate(new_page);
+	__folio_mark_uptodate(new_folio);
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 				vmf->address & PAGE_MASK,
@@ -3101,8 +3104,8 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	 */
 	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
 	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
-		if (old_page) {
-			if (!PageAnon(old_page)) {
+		if (old_folio) {
+			if (!folio_test_anon(old_folio)) {
 				dec_mm_counter(mm, mm_counter_file(old_page));
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
@@ -3130,7 +3133,7 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		 */
 		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
 		page_add_new_anon_rmap(new_page, vma, vmf->address);
-		lru_cache_add_inactive_or_unevictable(new_page, vma);
+		folio_add_lru_vma(new_folio, vma);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
@@ -3139,7 +3142,7 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		BUG_ON(unshare && pte_write(entry));
 		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
 		update_mmu_cache(vma, vmf->address, vmf->pte);
-		if (old_page) {
+		if (old_folio) {
 			/*
 			 * Only after switching the pte to the new page may
 			 * we remove the mapcount here. Otherwise another
@@ -3166,14 +3169,14 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		}
 
 		/* Free the old page.. */
-		new_page = old_page;
+		new_folio = old_folio;
 		page_copied = 1;
 	} else {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);
 	}
 
-	if (new_page)
-		put_page(new_page);
+	if (new_folio)
+		folio_put(new_folio);
 
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	/*
@@ -3181,19 +3184,19 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	 * the above ptep_clear_flush_notify() did already call it.
 	 */
 	mmu_notifier_invalidate_range_only_end(&range);
-	if (old_page) {
+	if (old_folio) {
 		if (page_copied)
 			free_swap_cache(old_page);
-		put_page(old_page);
+		folio_put(old_folio);
 	}
 
 	delayacct_wpcopy_end();
 	return 0;
 oom_free_new:
-	put_page(new_page);
+	folio_put(new_folio);
 oom:
-	if (old_page)
-		put_page(old_page);
+	if (old_folio)
+		folio_put(old_folio);
 
 	delayacct_wpcopy_end();
 	return VM_FAULT_OOM;