diff mbox series

[2/2] mm: Allocate THP on hugezeropage wp-fault

Message ID 20240830084117.4079805-3-dev.jain@arm.com (mailing list archive)
State New, archived
Headers show
Series Do not shatter hugezeropage on wp-fault | expand

Commit Message

Dev Jain Aug. 30, 2024, 8:41 a.m. UTC
Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
replace it with a PMD-mapped THP. Change the helpers introduced in the
previous patch to flush TLB entry corresponding to the hugezeropage,
and preserve PMD uffd-wp marker. In case of failure, fallback to
splitting the PMD.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 include/linux/huge_mm.h |  7 ++++
 mm/huge_memory.c        | 76 +++++++++++++++++++++++++++++++++++------
 mm/memory.c             |  5 +--
 3 files changed, 76 insertions(+), 12 deletions(-)

Comments

kernel test robot Sept. 3, 2024, 8:45 a.m. UTC | #1
Hello,

kernel test robot noticed "BUG:sleeping_function_called_from_invalid_context_at_mm/memory.c" on:

commit: c636ba74f021bfe8d72845f9e53ee2b8ea16f5f8 ("[PATCH 2/2] mm: Allocate THP on hugezeropage wp-fault")
url: https://github.com/intel-lab-lkp/linux/commits/Dev-Jain/mm-Abstract-THP-allocation/20240830-164300
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/20240830084117.4079805-3-dev.jain@arm.com/
patch subject: [PATCH 2/2] mm: Allocate THP on hugezeropage wp-fault

in testcase: trinity
version: trinity-i386-abe9de86-1_20230429
with following parameters:

	runtime: 600s



compiler: gcc-12
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202409031602.14479174-lkp@intel.com


[  189.202955][T15284] BUG: sleeping function called from invalid context at mm/memory.c:6690
[  189.203611][T15284] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15284, name: trinity-c6
[  189.204103][T15284] preempt_count: 1, expected: 0
[  189.204364][T15284] RCU nest depth: 0, expected: 0
[  189.204630][T15284] 2 locks held by trinity-c6/15284:
[189.204909][T15284] #0: ffff888164fc27b0 (&mm->mmap_lock){++++}-{3:3}, at: lock_mm_and_find_vma (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/jump_label.h:261 include/linux/jump_label.h:273 include/linux/mmap_lock.h:35 include/linux/mmap_lock.h:164 mm/memory.c:6067 mm/memory.c:6127) 
[189.205536][T15284] #1: ffff888160945c48 (ptlock_ptr(ptdesc)){+.+.}-{2:2}, at: do_huge_pmd_wp_page (mm/huge_memory.c:1816 mm/huge_memory.c:1838) 
[  189.206099][T15284] CPU: 1 UID: 65534 PID: 15284 Comm: trinity-c6 Not tainted 6.11.0-rc4-00551-gc636ba74f021 #1
[  189.206657][T15284] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[  189.207205][T15284] Call Trace:
[  189.207389][T15284]  <TASK>
[189.207552][T15284] dump_stack_lvl (lib/dump_stack.c:122) 
[189.207803][T15284] __might_resched (kernel/sched/core.c:8462) 
[189.208106][T15284] ? __pfx___might_resched (kernel/sched/core.c:8416) 
[189.208397][T15284] folio_zero_user (include/linux/kernel.h:73 mm/memory.c:6690 mm/memory.c:6767) 
[189.208657][T15284] thp_fault_alloc (include/linux/page-flags.h:785 (discriminator 2) mm/huge_memory.c:1156 (discriminator 2)) 
[189.208914][T15284] do_huge_pmd_wp_page (mm/huge_memory.c:1792 mm/huge_memory.c:1818 mm/huge_memory.c:1838) 
[189.209197][T15284] ? __lock_release+0x3fe/0x860 
[189.209494][T15284] ? __pfx_do_huge_pmd_wp_page (mm/huge_memory.c:1826) 
[189.209802][T15284] __handle_mm_fault (mm/memory.c:5614 mm/memory.c:5852) 
[189.210072][T15284] ? mt_find (lib/maple_tree.c:6961) 
[189.210303][T15284] ? __pfx___handle_mm_fault (mm/memory.c:5771) 
[189.210595][T15284] ? __pfx_mt_find (lib/maple_tree.c:6927) 
[189.210875][T15284] handle_mm_fault (mm/memory.c:6042) 
[189.211134][T15284] ? __pfx_handle_mm_fault (mm/memory.c:5997) 
[189.211424][T15284] ? down_read_trylock (kernel/locking/rwsem.c:1568) 
[189.211701][T15284] ? lock_mm_and_find_vma (mm/memory.c:6130) 
[189.211997][T15284] do_user_addr_fault (arch/x86/mm/fault.c:1391) 
[189.212283][T15284] exc_page_fault (arch/x86/include/asm/irqflags.h:26 arch/x86/include/asm/irqflags.h:87 arch/x86/include/asm/irqflags.h:147 arch/x86/mm/fault.c:1489 arch/x86/mm/fault.c:1539) 
[189.212539][T15284] asm_exc_page_fault (arch/x86/include/asm/idtentry.h:623) 
[189.212811][T15284] RIP: 0010:__put_user_4 (arch/x86/lib/putuser.S:88) 
[ 189.213091][T15284] Code: 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 89 cb 48 c1 fb 3f 48 09 d9 0f 1f 00 <89> 01 31 c9 0f 1f 00 c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90 90
All code
========
   0:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
   7:	00 
   8:	90                   	nop
   9:	90                   	nop
   a:	90                   	nop
   b:	90                   	nop
   c:	90                   	nop
   d:	90                   	nop
   e:	90                   	nop
   f:	90                   	nop
  10:	90                   	nop
  11:	90                   	nop
  12:	90                   	nop
  13:	90                   	nop
  14:	90                   	nop
  15:	90                   	nop
  16:	90                   	nop
  17:	90                   	nop
  18:	90                   	nop
  19:	f3 0f 1e fa          	endbr64
  1d:	48 89 cb             	mov    %rcx,%rbx
  20:	48 c1 fb 3f          	sar    $0x3f,%rbx
  24:	48 09 d9             	or     %rbx,%rcx
  27:	0f 1f 00             	nopl   (%rax)
  2a:*	89 01                	mov    %eax,(%rcx)		<-- trapping instruction
  2c:	31 c9                	xor    %ecx,%ecx
  2e:	0f 1f 00             	nopl   (%rax)
  31:	c3                   	ret
  32:	cc                   	int3
  33:	cc                   	int3
  34:	cc                   	int3
  35:	cc                   	int3
  36:	0f 1f 00             	nopl   (%rax)
  39:	90                   	nop
  3a:	90                   	nop
  3b:	90                   	nop
  3c:	90                   	nop
  3d:	90                   	nop
  3e:	90                   	nop
  3f:	90                   	nop

Code starting with the faulting instruction
===========================================
   0:	89 01                	mov    %eax,(%rcx)
   2:	31 c9                	xor    %ecx,%ecx
   4:	0f 1f 00             	nopl   (%rax)
   7:	c3                   	ret
   8:	cc                   	int3
   9:	cc                   	int3
   a:	cc                   	int3
   b:	cc                   	int3
   c:	0f 1f 00             	nopl   (%rax)
   f:	90                   	nop
  10:	90                   	nop
  11:	90                   	nop
  12:	90                   	nop
  13:	90                   	nop
  14:	90                   	nop
  15:	90                   	nop
[  189.214257][T15284] RSP: 0000:ffff88812fc0fd70 EFLAGS: 00010206
[  189.214726][T15284] RAX: 0000000020080522 RBX: 0000000000000000 RCX: 00000000ff7fffff
[  189.215167][T15284] RDX: 1ffff1102bfd712e RSI: 1ffff11075e09456 RDI: ffff88815feb8970
[  189.215605][T15284] RBP: 00000000ff7fffff R08: ffff88815feb8978 R09: fffffbfff50eecad
[  189.216059][T15284] R10: ffffffffa877656f R11: 0000000000000000 R12: 1ffff11025f81faf
[  189.216637][T15284] R13: ffff88812fc0fe30 R14: 00000000000000b9 R15: 0000000000000000
[189.217074][T15284] cap_validate_magic (kernel/capability.c:94) 
[189.217349][T15284] ? __pfx_cap_validate_magic (kernel/capability.c:76) 
[189.217654][T15284] __do_sys_capset (kernel/capability.c:230) 
[189.217925][T15284] ? __pfx___do_sys_capset (kernel/capability.c:221) 
[189.218226][T15284] do_int80_emulation (arch/x86/entry/common.c:165 arch/x86/entry/common.c:253) 
[189.218497][T15284] asm_int80_emulation (arch/x86/include/asm/idtentry.h:626) 
[  189.218871][T15284] RIP: 0023:0xf7f30092
[ 189.219200][T15284] Code: 00 00 00 e9 90 ff ff ff ff a3 24 00 00 00 68 30 00 00 00 e9 80 ff ff ff ff a3 f8 ff ff ff 66 90 00 00 00 00 00 00 00 00 cd 80 <c3> 8d b4 26 00 00 00 00 8d b6 00 00 00 00 8b 1c 24 c3 8d b4 26 00
All code
========
   0:	00 00                	add    %al,(%rax)
   2:	00 e9                	add    %ch,%cl
   4:	90                   	nop
   5:	ff                   	(bad)
   6:	ff                   	(bad)
   7:	ff                   	(bad)
   8:	ff a3 24 00 00 00    	jmp    *0x24(%rbx)
   e:	68 30 00 00 00       	push   $0x30
  13:	e9 80 ff ff ff       	jmp    0xffffffffffffff98
  18:	ff a3 f8 ff ff ff    	jmp    *-0x8(%rbx)
  1e:	66 90                	xchg   %ax,%ax
	...
  28:	cd 80                	int    $0x80
  2a:*	c3                   	ret		<-- trapping instruction
  2b:	8d b4 26 00 00 00 00 	lea    0x0(%rsi,%riz,1),%esi
  32:	8d b6 00 00 00 00    	lea    0x0(%rsi),%esi
  38:	8b 1c 24             	mov    (%rsp),%ebx
  3b:	c3                   	ret
  3c:	8d                   	.byte 0x8d
  3d:	b4 26                	mov    $0x26,%ah
	...

Code starting with the faulting instruction
===========================================
   0:	c3                   	ret
   1:	8d b4 26 00 00 00 00 	lea    0x0(%rsi,%riz,1),%esi
   8:	8d b6 00 00 00 00    	lea    0x0(%rsi),%esi
   e:	8b 1c 24             	mov    (%rsp),%ebx
  11:	c3                   	ret
  12:	8d                   	.byte 0x8d
  13:	b4 26                	mov    $0x26,%ah


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240903/202409031602.14479174-lkp@intel.com
Dev Jain Sept. 3, 2024, 12:44 p.m. UTC | #2
On 9/3/24 14:15, kernel test robot wrote:
>
> Hello,
>
> kernel test robot noticed "BUG:sleeping_function_called_from_invalid_context_at_mm/memory.c" on:
>
> commit: c636ba74f021bfe8d72845f9e53ee2b8ea16f5f8 ("[PATCH 2/2] mm: Allocate THP on hugezeropage wp-fault")
> url: https://github.com/intel-lab-lkp/linux/commits/Dev-Jain/mm-Abstract-THP-allocation/20240830-164300
> base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
> patch link: https://lore.kernel.org/all/20240830084117.4079805-3-dev.jain@arm.com/
> patch subject: [PATCH 2/2] mm: Allocate THP on hugezeropage wp-fault
>
> in testcase: trinity
> version: trinity-i386-abe9de86-1_20230429
> with following parameters:
>
> 	runtime: 600s
>
>
>
> compiler: gcc-12
> test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
>
> (please refer to attached dmesg/kmsg for entire log/backtrace)
>
>
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <oliver.sang@intel.com>
> | Closes: https://lore.kernel.org/oe-lkp/202409031602.14479174-lkp@intel.com
>
>
> [  189.202955][T15284] BUG: sleeping function called from invalid context at mm/memory.c:6690
> [  189.203611][T15284] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15284, name: trinity-c6

I guess, since thp_fault_alloc() may sleep due to folio_zero_user(), I need to call it before
do_huge_zero_wp_pmd_locked(). I will send a v2 since I also need to clean up the lock/unlock sites.

>
>
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..375dba4fb130 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -9,6 +9,13 @@ 
 #include <linux/kobject.h>
 
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+			   unsigned long haddr, struct folio **foliop,
+			   unsigned long addr);
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+		 struct vm_area_struct *vma, unsigned long haddr,
+		 pgtable_t pgtable)
+	__releases(vmf->ptl);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e5b568e2bb34..0f8b2e224795 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -943,9 +943,9 @@  unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
-static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
-				  unsigned long haddr, struct folio **foliop,
-				  unsigned long addr)
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+			   unsigned long haddr, struct folio **foliop,
+			   unsigned long addr)
 {
 	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
 
@@ -984,22 +984,30 @@  static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
 	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
 }
 
-static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
-			struct vm_area_struct *vma, unsigned long haddr,
-			pgtable_t pgtable)
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+		 struct vm_area_struct *vma, unsigned long haddr,
+		 pgtable_t pgtable)
 	__releases(vmf->ptl)
 {
-	pmd_t entry;
+	pmd_t entry, old_pmd;
+	bool is_pmd_none = pmd_none(*vmf->pmd);
 
 	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
 	folio_add_lru_vma(folio, vma);
-	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+	if (!is_pmd_none) {
+		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+		if (pmd_uffd_wp(old_pmd))
+			entry = pmd_mkuffd_wp(entry);
+	}
+	if (pgtable)
+		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	mm_inc_nr_ptes(vma->vm_mm);
+	if (is_pmd_none)
+		mm_inc_nr_ptes(vma->vm_mm);
 	spin_unlock(vmf->ptl);
 	__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
 }
@@ -1577,6 +1585,47 @@  void huge_pmd_set_accessed(struct vm_fault *vmf)
 	spin_unlock(vmf->ptl);
 }
 
+static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
+					     unsigned long haddr)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	gfp_t gfp = vma_thp_gfp_mask(vma);
+	struct folio *folio = NULL;
+	vm_fault_t ret;
+
+	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
+			      vmf->address);
+	if (ret)
+		goto unlock;
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto unlock;
+	map_pmd_thp(folio, vmf, vma, haddr, NULL);
+	return 0;
+
+unlock:
+	spin_unlock(vmf->ptl);
+	return ret;
+}
+
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct mmu_notifier_range range;
+	vm_fault_t ret = 0;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (likely(pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+		ret = do_huge_zero_wp_pmd_locked(vmf, haddr);
+	else
+		spin_unlock(vmf->ptl);
+	mmu_notifier_invalidate_range_end(&range);
+	return ret;
+}
+
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1589,8 +1638,15 @@  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
-	if (is_huge_zero_pmd(orig_pmd))
+	if (is_huge_zero_pmd(orig_pmd)) {
+		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
+
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+
+		/* Fallback to splitting PMD if THP cannot be allocated */
 		goto fallback;
+	}
 
 	spin_lock(vmf->ptl);
 
diff --git a/mm/memory.c b/mm/memory.c
index 3c01d68065be..c081a25f5173 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5409,9 +5409,10 @@  static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vma)) {
 		if (likely(!unshare) &&
 		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
-			if (userfaultfd_wp_async(vmf->vma))
+			if (!userfaultfd_wp_async(vmf->vma))
+				return handle_userfault(vmf, VM_UFFD_WP);
+			if (!is_huge_zero_pmd(vmf->orig_pmd))
 				goto split;
-			return handle_userfault(vmf, VM_UFFD_WP);
 		}
 		return do_huge_pmd_wp_page(vmf);
 	}