[v2,8/9] hugetlb: use new vma_lock for pmd sharing synchronization

Message ID 20220914221810.95771-9-mike.kravetz@oracle.com (mailing list archive)
State New
Series hugetlb: Use new vma lock for huge pmd sharing synchronization

Commit Message

Mike Kravetz Sept. 14, 2022, 10:18 p.m. UTC
The new hugetlb vma lock is used to address this race:

Faulting thread                                 Unsharing thread
...                                                  ...
ptep = huge_pte_offset()
      or
ptep = huge_pte_alloc()
...
                                                i_mmap_lock_write
                                                lock page table
ptep invalid   <------------------------        huge_pmd_unshare()
Could be in a previously                        unlock_page_table
sharing process or worse                        i_mmap_unlock_write
...

The vma_lock is used as follows:
- During fault processing. The lock is acquired in read mode before
  doing a page table lock and allocation (huge_pte_alloc).  The lock is
  held until code is finished with the page table entry (ptep).
- The lock must be held in write mode whenever huge_pmd_unshare is
  called.
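
In outline, the resulting lock ordering looks like this (a condensed
sketch of the changes in this patch; error paths omitted):

	/* fault path: hold vma_lock in read mode while using ptep */
	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	hugetlb_vma_lock_read(vma);
	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
	/* ... handle the fault; ptep can not be unshared here ... */
	hugetlb_vma_unlock_read(vma);
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);

	/* unshare path: vma_lock in write mode, taken BEFORE i_mmap_rwsem */
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	/* ... huge_pmd_unshare() may be called here ... */
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);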

Lock ordering issues come into play when unmapping a page from all
vmas mapping the page.  The i_mmap_rwsem must be held to search for the
vmas, and the vma lock must be held before calling unmap which will
call huge_pmd_unshare.  This is done today in:
- try_to_migrate_one and try_to_unmap_one for page migration and memory
  error handling.  In these routines we 'try' to obtain the vma lock and
  fail to unmap if unsuccessful.  Calling routines already deal with the
  failure of unmapping.
- hugetlb_vmdelete_list for truncation and hole punch.  This routine
  also tries to acquire the vma lock.  If it fails, it skips the
  unmapping.  However, we can not have file truncation or hole punch
  fail because of contention.  After hugetlb_vmdelete_list, truncation
  and hole punch call remove_inode_hugepages.  remove_inode_hugepages
  checks for mapped pages and calls hugetlb_unmap_file_folio to unmap them.
  hugetlb_unmap_file_folio is designed to drop locks and reacquire in the
  correct order to guarantee unmap success.
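
Condensed, the hugetlb_unmap_file_folio scheme added below looks
roughly as follows (a sketch only; the full code in
fs/hugetlbfs/inode.c handles the refcounting and rechecks in detail):

	i_mmap_lock_write(mapping);
retry:
	vma_interval_tree_foreach(vma, root, start, end - 1) {
		if (!hugetlb_vma_trylock_write(vma)) {
			/* pin vma_lock so it survives dropping i_mmap_rwsem */
			kref_get(&vma_lock->refs);
			break;
		}
		unmap_hugepage_range(vma, ...);
		hugetlb_vma_unlock_write(vma);
	}
	i_mmap_unlock_write(mapping);

	if (vma_lock) {
		down_write(&vma_lock->rw_sema);	/* vma_lock first ... */
		i_mmap_lock_write(mapping);	/* ... then i_mmap_rwsem */
		/* recheck vma_lock->vma and the mapping, unmap, retry */
		goto retry;
	}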

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c |  66 +++++++++++++++++++++++++++-
 mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
 mm/memory.c          |   2 +
 mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
 mm/userfaultfd.c     |   9 +++-
 5 files changed, 233 insertions(+), 46 deletions(-)

Comments

Miaohe Lin Sept. 29, 2022, 6:08 a.m. UTC | #1
On 2022/9/15 6:18, Mike Kravetz wrote:
> The new hugetlb vma lock is used to address this race:
> 
> Faulting thread                                 Unsharing thread
> ...                                                  ...
> ptep = huge_pte_offset()
>       or
> ptep = huge_pte_alloc()
> ...
>                                                 i_mmap_lock_write
>                                                 lock page table
> ptep invalid   <------------------------        huge_pmd_unshare()
> Could be in a previously                        unlock_page_table
> sharing process or worse                        i_mmap_unlock_write
> ...
> 
> The vma_lock is used as follows:
> - During fault processing. The lock is acquired in read mode before
>   doing a page table lock and allocation (huge_pte_alloc).  The lock is
>   held until code is finished with the page table entry (ptep).
> - The lock must be held in write mode whenever huge_pmd_unshare is
>   called.
> 
> Lock ordering issues come into play when unmapping a page from all
> vmas mapping the page.  The i_mmap_rwsem must be held to search for the
> vmas, and the vma lock must be held before calling unmap which will
> call huge_pmd_unshare.  This is done today in:
> - try_to_migrate_one and try_to_unmap_one for page migration and memory
>   error handling.  In these routines we 'try' to obtain the vma lock and
>   fail to unmap if unsuccessful.  Calling routines already deal with the
>   failure of unmapping.
> - hugetlb_vmdelete_list for truncation and hole punch.  This routine
>   also tries to acquire the vma lock.  If it fails, it skips the
>   unmapping.  However, we can not have file truncation or hole punch
>   fail because of contention.  After hugetlb_vmdelete_list, truncation
>   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
>   checks for mapped pages and calls hugetlb_unmap_file_folio to unmap them.
>   hugetlb_unmap_file_folio is designed to drop locks and reacquire in the
>   correct order to guarantee unmap success.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  fs/hugetlbfs/inode.c |  66 +++++++++++++++++++++++++++-
>  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
>  mm/memory.c          |   2 +
>  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
>  mm/userfaultfd.c     |   9 +++-
>  5 files changed, 233 insertions(+), 46 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 3bb1772fce2f..009ae539b9b2 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>  					struct folio *folio, pgoff_t index)
>  {
>  	struct rb_root_cached *root = &mapping->i_mmap;
> +	struct hugetlb_vma_lock *vma_lock;
>  	struct page *page = &folio->page;
>  	struct vm_area_struct *vma;
>  	unsigned long v_start;
> @@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>  	end = (index + 1) * pages_per_huge_page(h);
>  
>  	i_mmap_lock_write(mapping);
> -
> +retry:
> +	vma_lock = NULL;
>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
>  		v_start = vma_offset_start(vma, start);
>  		v_end = vma_offset_end(vma, end);
> @@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
>  			continue;
>  
> +		if (!hugetlb_vma_trylock_write(vma)) {
> +			vma_lock = vma->vm_private_data;
> +			/*
> +			 * If we can not get vma lock, we need to drop
> +			 * i_mmap_rwsem and take locks in order.  First,
> +			 * take a ref on the vma_lock structure so that
> +			 * we can be guaranteed it will not go away when
> +			 * dropping i_mmap_rwsem.
> +			 */
> +			kref_get(&vma_lock->refs);
> +			break;
> +		}
> +
>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>  				NULL, ZAP_FLAG_DROP_MARKER);
> +		hugetlb_vma_unlock_write(vma);
>  	}
>  
>  	i_mmap_unlock_write(mapping);
> +
> +	if (vma_lock) {
> +		/*
> +		 * Wait on vma_lock.  We know it is still valid as we have
> +		 * a reference.  We must 'open code' vma locking as we do
> +		 * not know if vma_lock is still attached to vma.
> +		 */
> +		down_write(&vma_lock->rw_sema);
> +		i_mmap_lock_write(mapping);
> +
> +		vma = vma_lock->vma;
> +		if (!vma) {

Thanks Mike. This method looks much simpler. But IIUC, this code can race with exit_mmap:

CPU 1					CPU 2
hugetlb_unmap_file_folio		exit_mmap
  kref_get(&vma_lock->refs);
  down_write(&vma_lock->rw_sema);
					  free_pgtables // i_mmap_lock_write is held inside it.
  i_mmap_lock_write(mapping);
  vma = vma_lock->vma;
					  remove_vma
					    hugetlb_vm_op_close
					      hugetlb_vma_lock_free
					        vma_lock->vma = NULL;
					    vm_area_free(vma);
  vma is used-after-free??

The root cause is that free_pgtables is protected with i_mmap_lock_write while remove_vma is not.
Or am I missing something again? ;)

> +			/*
> +			 * If lock is no longer attached to vma, then just
> +			 * unlock, drop our reference and retry looking for
> +			 * other vmas.
> +			 */
> +			up_write(&vma_lock->rw_sema);
> +			kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
> +			goto retry;
> +		}
> +
> +		/*
> +		 * vma_lock is still attached to vma.  Check to see if vma
> +		 * still maps page and if so, unmap.
> +		 */
> +		v_start = vma_offset_start(vma, start);
> +		v_end = vma_offset_end(vma, end);
> +		if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
> +			unmap_hugepage_range(vma, vma->vm_start + v_start,
> +						v_end, NULL,
> +						ZAP_FLAG_DROP_MARKER);
> +
> +		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
> +		hugetlb_vma_unlock_write(vma);
> +
> +		goto retry;
> +	}
>  }
>  
>  static void
> @@ -474,11 +528,21 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
>  		unsigned long v_start;
>  		unsigned long v_end;
>  
> +		if (!hugetlb_vma_trylock_write(vma))
> +			continue;
> +
>  		v_start = vma_offset_start(vma, start);
>  		v_end = vma_offset_end(vma, end);
>  
>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>  				     NULL, zap_flags);
> +
> +		/*
> +		 * Note that vma lock only exists for shared/non-private
> +		 * vmas.  Therefore, lock is not held when calling
> +		 * unmap_hugepage_range for private vmas.
> +		 */
> +		hugetlb_vma_unlock_write(vma);
>  	}
>  }
>  
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 616be891b798..e8cbc0f7cdaa 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4795,6 +4795,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>  		mmu_notifier_invalidate_range_start(&range);
>  		mmap_assert_write_locked(src);
>  		raw_write_seqcount_begin(&src->write_protect_seq);
> +	} else {
> +		/*
> +		 * For shared mappings the vma lock must be held before
> +		 * calling huge_pte_offset in the src vma. Otherwise, the
> +		 * returned ptep could go away if part of a shared pmd and
> +		 * another thread calls huge_pmd_unshare.
> +		 */
> +		hugetlb_vma_lock_read(src_vma);
>  	}
>  
>  	last_addr_mask = hugetlb_mask_last_page(h);
> @@ -4941,6 +4949,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>  	if (cow) {
>  		raw_write_seqcount_end(&src->write_protect_seq);
>  		mmu_notifier_invalidate_range_end(&range);
> +	} else {
> +		hugetlb_vma_unlock_read(src_vma);
>  	}
>  
>  	return ret;
> @@ -4999,6 +5009,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>  	mmu_notifier_invalidate_range_start(&range);
>  	last_addr_mask = hugetlb_mask_last_page(h);
>  	/* Prevent race with file truncation */
> +	hugetlb_vma_lock_write(vma);
>  	i_mmap_lock_write(mapping);
>  	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
>  		src_pte = huge_pte_offset(mm, old_addr, sz);
> @@ -5030,6 +5041,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>  		flush_tlb_range(vma, old_end - len, old_end);
>  	mmu_notifier_invalidate_range_end(&range);
>  	i_mmap_unlock_write(mapping);
> +	hugetlb_vma_unlock_write(vma);
>  
>  	return len + old_addr - old_end;
>  }
> @@ -5349,8 +5361,29 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
>  		 * may get SIGKILLed if it later faults.
>  		 */
>  		if (outside_reserve) {
> +			struct address_space *mapping = vma->vm_file->f_mapping;
> +			pgoff_t idx;
> +			u32 hash;
> +
>  			put_page(old_page);
> +			/*
> +			 * Drop hugetlb_fault_mutex and vma_lock before
> +			 * unmapping.  unmapping needs to hold vma_lock
> +			 * in write mode.  Dropping vma_lock in read mode
> +			 * here is OK as COW mappings do not interact with
> +			 * PMD sharing.
> +			 *
> +			 * Reacquire both after unmap operation.
> +			 */
> +			idx = vma_hugecache_offset(h, vma, haddr);
> +			hash = hugetlb_fault_mutex_hash(mapping, idx);
> +			hugetlb_vma_unlock_read(vma);
> +			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> +
>  			unmap_ref_private(mm, vma, old_page, haddr);
> +
> +			mutex_lock(&hugetlb_fault_mutex_table[hash]);
> +			hugetlb_vma_lock_read(vma);
>  			spin_lock(ptl);
>  			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
>  			if (likely(ptep &&
> @@ -5499,14 +5532,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
>  	};
>  
>  	/*
> -	 * hugetlb_fault_mutex and i_mmap_rwsem must be
> +	 * vma_lock and hugetlb_fault_mutex must be
>  	 * dropped before handling userfault.  Reacquire
>  	 * after handling fault to make calling code simpler.
>  	 */
> +	hugetlb_vma_unlock_read(vma);
>  	hash = hugetlb_fault_mutex_hash(mapping, idx);
>  	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>  	ret = handle_userfault(&vmf, reason);
>  	mutex_lock(&hugetlb_fault_mutex_table[hash]);
> +	hugetlb_vma_lock_read(vma);
>  
>  	return ret;
>  }
> @@ -5740,6 +5775,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  
>  	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
>  	if (ptep) {
> +		/*
> +		 * Since we hold no locks, ptep could be stale.  That is
> +		 * OK as we are only making decisions based on content and
> +		 * not actually modifying content here.
> +		 */
>  		entry = huge_ptep_get(ptep);
>  		if (unlikely(is_hugetlb_entry_migration(entry))) {
>  			migration_entry_wait_huge(vma, ptep);
> @@ -5747,23 +5787,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
>  			return VM_FAULT_HWPOISON_LARGE |
>  				VM_FAULT_SET_HINDEX(hstate_index(h));
> -	} else {
> -		ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
> -		if (!ptep)
> -			return VM_FAULT_OOM;
>  	}
>  
> -	mapping = vma->vm_file->f_mapping;
> -	idx = vma_hugecache_offset(h, vma, haddr);
> -
>  	/*
>  	 * Serialize hugepage allocation and instantiation, so that we don't
>  	 * get spurious allocation failures if two CPUs race to instantiate
>  	 * the same page in the page cache.
>  	 */
> +	mapping = vma->vm_file->f_mapping;
> +	idx = vma_hugecache_offset(h, vma, haddr);
>  	hash = hugetlb_fault_mutex_hash(mapping, idx);
>  	mutex_lock(&hugetlb_fault_mutex_table[hash]);
>  
> +	/*
> +	 * Acquire vma lock before calling huge_pte_alloc and hold
> +	 * until finished with ptep.  This prevents huge_pmd_unshare from
> +	 * being called elsewhere and making the ptep no longer valid.
> +	 *
> +	 * ptep could have already been assigned via huge_pte_offset.  That
> +	 * is OK, as huge_pte_alloc will return the same value unless
> +	 * something has changed.
> +	 */
> +	hugetlb_vma_lock_read(vma);

[1] says vma_lock for each vma mapping the file provides the same type of synchronization
around i_size as provided by the fault mutex. But what if vma->vm_private_data is NULL,
i.e. hugetlb_vma_lock_alloc fails to alloc vma_lock? There won't be such synchronization
in this case.

[1] https://lore.kernel.org/lkml/Yxiv0SkMkZ0JWGGp@monkey/#t


Other parts of the patch look good to me. Thanks for your work.

Thanks,
Miaohe Lin


> +	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
> +	if (!ptep) {
> +		hugetlb_vma_unlock_read(vma);
> +		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> +		return VM_FAULT_OOM;
> +	}
> +
>  	entry = huge_ptep_get(ptep);
>  	/* PTE markers should be handled the same way as none pte */
>  	if (huge_pte_none_mostly(entry)) {
> @@ -5824,6 +5876,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unlock_page(pagecache_page);
>  			put_page(pagecache_page);
>  		}
> +		hugetlb_vma_unlock_read(vma);
>  		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>  		return handle_userfault(&vmf, VM_UFFD_WP);
>  	}
> @@ -5867,6 +5920,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		put_page(pagecache_page);
>  	}
>  out_mutex:
> +	hugetlb_vma_unlock_read(vma);
>  	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>  	/*
>  	 * Generally it's safe to hold refcount during waiting page lock. But
> @@ -6329,8 +6383,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
>  	flush_cache_range(vma, range.start, range.end);
>  
>  	mmu_notifier_invalidate_range_start(&range);
> -	last_addr_mask = hugetlb_mask_last_page(h);
> +	hugetlb_vma_lock_write(vma);
>  	i_mmap_lock_write(vma->vm_file->f_mapping);
> +	last_addr_mask = hugetlb_mask_last_page(h);
>  	for (; address < end; address += psize) {
>  		spinlock_t *ptl;
>  		ptep = huge_pte_offset(mm, address, psize);
> @@ -6429,6 +6484,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
>  	 * See Documentation/mm/mmu_notifier.rst
>  	 */
>  	i_mmap_unlock_write(vma->vm_file->f_mapping);
> +	hugetlb_vma_unlock_write(vma);
>  	mmu_notifier_invalidate_range_end(&range);
>  
>  	return pages << h->order;
> @@ -6930,6 +6986,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
>  	pud_t *pud = pud_offset(p4d, addr);
>  
>  	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
> +	hugetlb_vma_assert_locked(vma);
>  	BUG_ON(page_count(virt_to_page(ptep)) == 0);
>  	if (page_count(virt_to_page(ptep)) == 1)
>  		return 0;
> @@ -6941,6 +6998,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
>  }
>  
>  #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
> +void hugetlb_vma_lock_read(struct vm_area_struct *vma)
> +{
> +}
> +
> +void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
> +{
> +}
> +
> +void hugetlb_vma_lock_write(struct vm_area_struct *vma)
> +{
> +}
> +
> +void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
> +{
> +}
> +
> +int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
> +{
> +	return 1;
> +}
> +
> +void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
> +{
> +}
> +
>  static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>  {
>  }
> @@ -7318,6 +7400,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
>  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
>  				start, end);
>  	mmu_notifier_invalidate_range_start(&range);
> +	hugetlb_vma_lock_write(vma);
>  	i_mmap_lock_write(vma->vm_file->f_mapping);
>  	for (address = start; address < end; address += PUD_SIZE) {
>  		ptep = huge_pte_offset(mm, address, sz);
> @@ -7329,6 +7412,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
>  	}
>  	flush_hugetlb_tlb_range(vma, start, end);
>  	i_mmap_unlock_write(vma->vm_file->f_mapping);
> +	hugetlb_vma_unlock_write(vma);
>  	/*
>  	 * No need to call mmu_notifier_invalidate_range(), see
>  	 * Documentation/mm/mmu_notifier.rst.
> diff --git a/mm/memory.c b/mm/memory.c
> index c4c3c2fd4f45..118e5f023597 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1685,10 +1685,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
>  			if (vma->vm_file) {
>  				zap_flags_t zap_flags = details ?
>  				    details->zap_flags : 0;
> +				hugetlb_vma_lock_write(vma);
>  				i_mmap_lock_write(vma->vm_file->f_mapping);
>  				__unmap_hugepage_range_final(tlb, vma, start, end,
>  							     NULL, zap_flags);
>  				i_mmap_unlock_write(vma->vm_file->f_mapping);
> +				hugetlb_vma_unlock_write(vma);
>  			}
>  		} else
>  			unmap_page_range(tlb, vma, start, end, details);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 744faaef0489..2ec925e5fa6a 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1554,24 +1554,39 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>  			 * To call huge_pmd_unshare, i_mmap_rwsem must be
>  			 * held in write mode.  Caller needs to explicitly
>  			 * do this outside rmap routines.
> +			 *
> +			 * We also must hold hugetlb vma_lock in write mode.
> +			 * Lock order dictates acquiring vma_lock BEFORE
> +			 * i_mmap_rwsem.  We can only try lock here and fail
> +			 * if unsuccessful.
>  			 */
> -			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
> -			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> -				flush_tlb_range(vma, range.start, range.end);
> -				mmu_notifier_invalidate_range(mm, range.start,
> -							      range.end);
> -
> -				/*
> -				 * The ref count of the PMD page was dropped
> -				 * which is part of the way map counting
> -				 * is done for shared PMDs.  Return 'true'
> -				 * here.  When there is no other sharing,
> -				 * huge_pmd_unshare returns false and we will
> -				 * unmap the actual page and drop map count
> -				 * to zero.
> -				 */
> -				page_vma_mapped_walk_done(&pvmw);
> -				break;
> +			if (!anon) {
> +				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> +				if (!hugetlb_vma_trylock_write(vma)) {
> +					page_vma_mapped_walk_done(&pvmw);
> +					ret = false;
> +					break;
> +				}
> +				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> +					hugetlb_vma_unlock_write(vma);
> +					flush_tlb_range(vma,
> +						range.start, range.end);
> +					mmu_notifier_invalidate_range(mm,
> +						range.start, range.end);
> +					/*
> +					 * The ref count of the PMD page was
> +					 * dropped which is part of the way map
> +					 * counting is done for shared PMDs.
> +					 * Return 'true' here.  When there is
> +					 * no other sharing, huge_pmd_unshare
> +					 * returns false and we will unmap the
> +					 * actual page and drop map count
> +					 * to zero.
> +					 */
> +					page_vma_mapped_walk_done(&pvmw);
> +					break;
> +				}
> +				hugetlb_vma_unlock_write(vma);
>  			}
>  			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
>  		} else {
> @@ -1929,26 +1944,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>  			 * To call huge_pmd_unshare, i_mmap_rwsem must be
>  			 * held in write mode.  Caller needs to explicitly
>  			 * do this outside rmap routines.
> +			 *
> +			 * We also must hold hugetlb vma_lock in write mode.
> +			 * Lock order dictates acquiring vma_lock BEFORE
> +			 * i_mmap_rwsem.  We can only try lock here and
> +			 * fail if unsuccessful.
>  			 */
> -			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
> -			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> -				flush_tlb_range(vma, range.start, range.end);
> -				mmu_notifier_invalidate_range(mm, range.start,
> -							      range.end);
> -
> -				/*
> -				 * The ref count of the PMD page was dropped
> -				 * which is part of the way map counting
> -				 * is done for shared PMDs.  Return 'true'
> -				 * here.  When there is no other sharing,
> -				 * huge_pmd_unshare returns false and we will
> -				 * unmap the actual page and drop map count
> -				 * to zero.
> -				 */
> -				page_vma_mapped_walk_done(&pvmw);
> -				break;
> +			if (!anon) {
> +				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> +				if (!hugetlb_vma_trylock_write(vma)) {
> +					page_vma_mapped_walk_done(&pvmw);
> +					ret = false;
> +					break;
> +				}
> +				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> +					hugetlb_vma_unlock_write(vma);
> +					flush_tlb_range(vma,
> +						range.start, range.end);
> +					mmu_notifier_invalidate_range(mm,
> +						range.start, range.end);
> +
> +					/*
> +					 * The ref count of the PMD page was
> +					 * dropped which is part of the way map
> +					 * counting is done for shared PMDs.
> +					 * Return 'true' here.  When there is
> +					 * no other sharing, huge_pmd_unshare
> +					 * returns false and we will unmap the
> +					 * actual page and drop map count
> +					 * to zero.
> +					 */
> +					page_vma_mapped_walk_done(&pvmw);
> +					break;
> +				}
> +				hugetlb_vma_unlock_write(vma);
>  			}
> -
>  			/* Nuke the hugetlb page table entry */
>  			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
>  		} else {
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 0fdbd2c05587..e24e8a47ce8a 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -379,16 +379,21 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>  		BUG_ON(dst_addr >= dst_start + len);
>  
>  		/*
> -		 * Serialize via hugetlb_fault_mutex.
> +		 * Serialize via vma_lock and hugetlb_fault_mutex.
> +		 * vma_lock ensures the dst_pte remains valid even
> +		 * in the case of shared pmds.  fault mutex prevents
> +		 * races with other faulting threads.
>  		 */
>  		idx = linear_page_index(dst_vma, dst_addr);
>  		mapping = dst_vma->vm_file->f_mapping;
>  		hash = hugetlb_fault_mutex_hash(mapping, idx);
>  		mutex_lock(&hugetlb_fault_mutex_table[hash]);
> +		hugetlb_vma_lock_read(dst_vma);
>  
>  		err = -ENOMEM;
>  		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
>  		if (!dst_pte) {
> +			hugetlb_vma_unlock_read(dst_vma);
>  			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>  			goto out_unlock;
>  		}
> @@ -396,6 +401,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>  		if (mode != MCOPY_ATOMIC_CONTINUE &&
>  		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
>  			err = -EEXIST;
> +			hugetlb_vma_unlock_read(dst_vma);
>  			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>  			goto out_unlock;
>  		}
> @@ -404,6 +410,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>  					       dst_addr, src_addr, mode, &page,
>  					       wp_copy);
>  
> +		hugetlb_vma_unlock_read(dst_vma);
>  		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>  
>  		cond_resched();
>
Mike Kravetz Oct. 1, 2022, midnight UTC | #2
On 09/29/22 14:08, Miaohe Lin wrote:
> On 2022/9/15 6:18, Mike Kravetz wrote:
> > @@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >  					struct folio *folio, pgoff_t index)
> >  {
> >  	struct rb_root_cached *root = &mapping->i_mmap;
> > +	struct hugetlb_vma_lock *vma_lock;
> >  	struct page *page = &folio->page;
> >  	struct vm_area_struct *vma;
> >  	unsigned long v_start;
> > @@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >  	end = (index + 1) * pages_per_huge_page(h);
> >  
> >  	i_mmap_lock_write(mapping);
> > -
> > +retry:
> > +	vma_lock = NULL;
> >  	vma_interval_tree_foreach(vma, root, start, end - 1) {
> >  		v_start = vma_offset_start(vma, start);
> >  		v_end = vma_offset_end(vma, end);
> > @@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
> >  			continue;
> >  
> > +		if (!hugetlb_vma_trylock_write(vma)) {
> > +			vma_lock = vma->vm_private_data;
> > +			/*
> > +			 * If we can not get vma lock, we need to drop
> > +			 * i_mmap_rwsem and take locks in order.  First,
> > +			 * take a ref on the vma_lock structure so that
> > +			 * we can be guaranteed it will not go away when
> > +			 * dropping i_mmap_rwsem.
> > +			 */
> > +			kref_get(&vma_lock->refs);
> > +			break;
> > +		}
> > +
> >  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
> >  				NULL, ZAP_FLAG_DROP_MARKER);
> > +		hugetlb_vma_unlock_write(vma);
> >  	}
> >  
> >  	i_mmap_unlock_write(mapping);
> > +
> > +	if (vma_lock) {
> > +		/*
> > +		 * Wait on vma_lock.  We know it is still valid as we have
> > +		 * a reference.  We must 'open code' vma locking as we do
> > +		 * not know if vma_lock is still attached to vma.
> > +		 */
> > +		down_write(&vma_lock->rw_sema);
> > +		i_mmap_lock_write(mapping);
> > +
> > +		vma = vma_lock->vma;
> > +		if (!vma) {
> 
> Thanks Mike. This method looks much simpler. But IIUC, this code can race with exit_mmap:
> 
> CPU 1					CPU 2
> hugetlb_unmap_file_folio		exit_mmap
>   kref_get(&vma_lock->refs);
>   down_write(&vma_lock->rw_sema);
> 					  free_pgtables // i_mmap_lock_write is held inside it.
>   i_mmap_lock_write(mapping);
>   vma = vma_lock->vma;
> 					  remove_vma
> 					    hugetlb_vm_op_close
> 					      hugetlb_vma_lock_free
> 					        vma_lock->vma = NULL;
> 					    vm_area_free(vma);
>   vma is used-after-free??
> 
> The root cause is that free_pgtables is protected with i_mmap_lock_write while remove_vma is not.
> Or am I missing something again? ;)

Thank you Miaohe!  Sorry for the delay in responding.

Yes, I agree this is a possible race.  My first thought is that we may be
able to address this by simply taking the vma_lock when we clear the
vma_lock->vma field.  Something like this,

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4cb44a4629b8..bf0c220ebc32 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6918,7 +6918,9 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 		 * certainly will no longer be attached to vma so clear
 		 * pointer.
 		 */
+		down_write(&vma_lock->rw_sema);
 		vma_lock->vma = NULL;
+		up_write(&vma_lock->rw_sema);
 		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
 		vma->vm_private_data = NULL;
 	}

I still need to do a bit more work to verify.

Andrew, if you are concerned I do not think this is a show stopper.  The
race should be extremely rare, and a fix should be coming quickly.

<snip>
> > +	mapping = vma->vm_file->f_mapping;
> > +	idx = vma_hugecache_offset(h, vma, haddr);
> >  	hash = hugetlb_fault_mutex_hash(mapping, idx);
> >  	mutex_lock(&hugetlb_fault_mutex_table[hash]);
> >  
> > +	/*
> > +	 * Acquire vma lock before calling huge_pte_alloc and hold
> > +	 * until finished with ptep.  This prevents huge_pmd_unshare from
> > +	 * being called elsewhere and making the ptep no longer valid.
> > +	 *
> > +	 * ptep could have already be assigned via huge_pte_offset.  That
> > +	 * is OK, as huge_pte_alloc will return the same value unless
> > +	 * something has changed.
> > +	 */
> > +	hugetlb_vma_lock_read(vma);
> 
> [1] says vma_lock for each vma mapping the file provides the same type of synchronization
> around i_size as provided by the fault mutex. But what if vma->vm_private_data is NULL,
> i.e. hugetlb_vma_lock_alloc fails to alloc vma_lock? There won't be such synchronization
> in this case.
> 
> [1] https://lore.kernel.org/lkml/Yxiv0SkMkZ0JWGGp@monkey/#t
> 

Right.

Of course, this (checking i_size) only applies to shared/file mappings.
The only time hugetlb_vma_lock_alloc should fail in such cases is when
we can not allocate the small vma_lock structure.  Since the vma_lock
is primarily for huge pmd sharing synchronization, my thought was that
allocation errors would just prevent sharing.  But, as you point out
it could also impact these checks.

It would be easy to check for the lock allocation failure at mmap time
and fail the mmap.  It would be a little more tricky at fork time.

This also is something that is highly unlikely to occur.  And, if we
can't allocate a vma_lock I suspect we will not be up and running long
enough for this to be an issue. :) Let me think about the best way to handle.
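
For illustration, a hypothetical (and untested) sketch of such an
mmap-time check; the exact call site and errno are assumptions, not
part of this series:

	/* hypothetical: after hugetlb_vma_lock_alloc() in hugetlbfs mmap setup */
	if ((vma->vm_flags & VM_MAYSHARE) && !vma->vm_private_data)
		return -ENOMEM;	/* vma_lock allocation failed; fail the mmap */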

> 
> Other parts of the patch look good to me. Thanks for your work.
> 

Thanks again,
Miaohe Lin Oct. 8, 2022, 2:29 a.m. UTC | #3
On 2022/10/1 8:00, Mike Kravetz wrote:
> On 09/29/22 14:08, Miaohe Lin wrote:
>> On 2022/9/15 6:18, Mike Kravetz wrote:
>>> @@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>  					struct folio *folio, pgoff_t index)
>>>  {
>>>  	struct rb_root_cached *root = &mapping->i_mmap;
>>> +	struct hugetlb_vma_lock *vma_lock;
>>>  	struct page *page = &folio->page;
>>>  	struct vm_area_struct *vma;
>>>  	unsigned long v_start;
>>> @@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>  	end = (index + 1) * pages_per_huge_page(h);
>>>  
>>>  	i_mmap_lock_write(mapping);
>>> -
>>> +retry:
>>> +	vma_lock = NULL;
>>>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
>>>  		v_start = vma_offset_start(vma, start);
>>>  		v_end = vma_offset_end(vma, end);
>>> @@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
>>>  			continue;
>>>  
>>> +		if (!hugetlb_vma_trylock_write(vma)) {
>>> +			vma_lock = vma->vm_private_data;
>>> +			/*
>>> +			 * If we can not get vma lock, we need to drop
>>> +			 * i_mmap_rwsem and take locks in order.  First,
>>> +			 * take a ref on the vma_lock structure so that
>>> +			 * we can be guaranteed it will not go away when
>>> +			 * dropping i_mmap_rwsem.
>>> +			 */
>>> +			kref_get(&vma_lock->refs);
>>> +			break;
>>> +		}
>>> +
>>>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>>>  				NULL, ZAP_FLAG_DROP_MARKER);
>>> +		hugetlb_vma_unlock_write(vma);
>>>  	}
>>>  
>>>  	i_mmap_unlock_write(mapping);
>>> +
>>> +	if (vma_lock) {
>>> +		/*
>>> +		 * Wait on vma_lock.  We know it is still valid as we have
>>> +		 * a reference.  We must 'open code' vma locking as we do
>>> +		 * not know if vma_lock is still attached to vma.
>>> +		 */
>>> +		down_write(&vma_lock->rw_sema);
>>> +		i_mmap_lock_write(mapping);
>>> +
>>> +		vma = vma_lock->vma;
>>> +		if (!vma) {
>>
>> Thanks Mike. This method looks much simpler. But IIUC, this code can race with exit_mmap:
>>
>> CPU 1					CPU 2
>> hugetlb_unmap_file_folio		exit_mmap
>>   kref_get(&vma_lock->refs);
>>   down_write(&vma_lock->rw_sema);
>> 					  free_pgtables // i_mmap_lock_write is held inside it.
>>   i_mmap_lock_write(mapping);
>>   vma = vma_lock->vma;
>> 					  remove_vma
>> 					    hugetlb_vm_op_close
>> 					      hugetlb_vma_lock_free
>> 					        vma_lock->vma = NULL;
>> 					    vm_area_free(vma);
>>   vma is used-after-free??
>>
>> The root cause is that free_pgtables is protected with i_mmap_lock_write while remove_vma is not.
>> Or am I missing something again? ;)
> 
> Thank you Miaohe!  Sorry for the delay in responding.
> 
> Yes, I agree this is a possible race.  My first thought is that we may be
> able to address this by simply taking the vma_lock when we clear the
> vma_lock->vma field.  Something like this,
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4cb44a4629b8..bf0c220ebc32 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6918,7 +6918,9 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
>  		 * certainly will no longer be attached to vma so clear
>  		 * pointer.
>  		 */
> +		down_write(&vma_lock->rw_sema);
>  		vma_lock->vma = NULL;
> +		up_write(&vma_lock->rw_sema);
>  		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
>  		vma->vm_private_data = NULL;
>  	}

AFAICT, this should work. And since we won't hold vma_lock->rw_sema when deleting the vma, there
should not be a possible deadlock.

> 
> I still need to do a bit more work to verify.
> 
> Andrew, if you are concerned I do not think this is a show stopper.  The
> race should be extremely rare, and a fix should be coming quickly.

Agree, this should be really rare.

> 
> <snip>
>>> +	mapping = vma->vm_file->f_mapping;
>>> +	idx = vma_hugecache_offset(h, vma, haddr);
>>>  	hash = hugetlb_fault_mutex_hash(mapping, idx);
>>>  	mutex_lock(&hugetlb_fault_mutex_table[hash]);
>>>  
>>> +	/*
>>> +	 * Acquire vma lock before calling huge_pte_alloc and hold
>>> +	 * until finished with ptep.  This prevents huge_pmd_unshare from
>>> +	 * being called elsewhere and making the ptep no longer valid.
>>> +	 *
>>> +	 * ptep could have already been assigned via huge_pte_offset.  That
>>> +	 * is OK, as huge_pte_alloc will return the same value unless
>>> +	 * something has changed.
>>> +	 */
>>> +	hugetlb_vma_lock_read(vma);
>>
>> [1] says vma_lock for each vma mapping the file provides the same type of synchronization
>> around i_size as provided by the fault mutex. But what if vma->vm_private_data is NULL,
>> i.e. hugetlb_vma_lock_alloc fails to alloc vma_lock? There won't be such synchronization
>> in this case.
>>
>> [1] https://lore.kernel.org/lkml/Yxiv0SkMkZ0JWGGp@monkey/#t
>>
> 
> Right.
> 
> Of course, this (checking i_size) only applies to shared/file mappings.
> The only time hugetlb_vma_lock_alloc should fail in such cases is when
> we can not allocate the small vma_lock structure.  Since the vma_lock
> is primarily for huge pmd sharing synchronization, my thought was that
> allocation errors would just prevent sharing.  But, as you point out
> it could also impact these checks.
> 
> It would be easy to check for the lock allocation failure at mmap time
> and fail the mmap.  It would be a little more tricky at fork time.
> 
> This also is something that is highly unlikely to occur.  And, if we
> can't allocate a vma_lock I suspect we will not be up and running long
> enough for this to be an issue. :) Let me think about the best way to handle.

Agree. This should be really rare too. Thanks for your work.

Thanks,
Miaohe Lin

> 
>>
>> Other parts of the patch look good to me. Thanks for your work.
>>
> 
> Thanks again,
>

Patch

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3bb1772fce2f..009ae539b9b2 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -434,6 +434,7 @@  static void hugetlb_unmap_file_folio(struct hstate *h,
 					struct folio *folio, pgoff_t index)
 {
 	struct rb_root_cached *root = &mapping->i_mmap;
+	struct hugetlb_vma_lock *vma_lock;
 	struct page *page = &folio->page;
 	struct vm_area_struct *vma;
 	unsigned long v_start;
@@ -444,7 +445,8 @@  static void hugetlb_unmap_file_folio(struct hstate *h,
 	end = (index + 1) * pages_per_huge_page(h);
 
 	i_mmap_lock_write(mapping);
-
+retry:
+	vma_lock = NULL;
 	vma_interval_tree_foreach(vma, root, start, end - 1) {
 		v_start = vma_offset_start(vma, start);
 		v_end = vma_offset_end(vma, end);
@@ -452,11 +454,63 @@  static void hugetlb_unmap_file_folio(struct hstate *h,
 		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
 			continue;
 
+		if (!hugetlb_vma_trylock_write(vma)) {
+			vma_lock = vma->vm_private_data;
+			/*
+			 * If we can not get vma lock, we need to drop
+			 * i_mmap_rwsem and take locks in order.  First,
+			 * take a ref on the vma_lock structure so that
+			 * we can be guaranteed it will not go away when
+			 * dropping i_mmap_rwsem.
+			 */
+			kref_get(&vma_lock->refs);
+			break;
+		}
+
 		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
 				NULL, ZAP_FLAG_DROP_MARKER);
+		hugetlb_vma_unlock_write(vma);
 	}
 
 	i_mmap_unlock_write(mapping);
+
+	if (vma_lock) {
+		/*
+		 * Wait on vma_lock.  We know it is still valid as we have
+		 * a reference.  We must 'open code' vma locking as we do
+		 * not know if vma_lock is still attached to vma.
+		 */
+		down_write(&vma_lock->rw_sema);
+		i_mmap_lock_write(mapping);
+
+		vma = vma_lock->vma;
+		if (!vma) {
+			/*
+			 * If lock is no longer attached to vma, then just
+			 * unlock, drop our reference and retry looking for
+			 * other vmas.
+			 */
+			up_write(&vma_lock->rw_sema);
+			kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+			goto retry;
+		}
+
+		/*
+		 * vma_lock is still attached to vma.  Check to see if vma
+		 * still maps page and if so, unmap.
+		 */
+		v_start = vma_offset_start(vma, start);
+		v_end = vma_offset_end(vma, end);
+		if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+			unmap_hugepage_range(vma, vma->vm_start + v_start,
+						v_end, NULL,
+						ZAP_FLAG_DROP_MARKER);
+
+		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+		hugetlb_vma_unlock_write(vma);
+
+		goto retry;
+	}
 }
 
 static void
@@ -474,11 +528,21 @@  hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
 		unsigned long v_start;
 		unsigned long v_end;
 
+		if (!hugetlb_vma_trylock_write(vma))
+			continue;
+
 		v_start = vma_offset_start(vma, start);
 		v_end = vma_offset_end(vma, end);
 
 		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
 				     NULL, zap_flags);
+
+		/*
+		 * Note that vma lock only exists for shared/non-private
+		 * vmas.  Therefore, lock is not held when calling
+		 * unmap_hugepage_range for private vmas.
+		 */
+		hugetlb_vma_unlock_write(vma);
 	}
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 616be891b798..e8cbc0f7cdaa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4795,6 +4795,14 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		mmu_notifier_invalidate_range_start(&range);
 		mmap_assert_write_locked(src);
 		raw_write_seqcount_begin(&src->write_protect_seq);
+	} else {
+		/*
+		 * For shared mappings the vma lock must be held before
+		 * calling huge_pte_offset in the src vma. Otherwise, the
+		 * returned ptep could go away if part of a shared pmd and
+		 * another thread calls huge_pmd_unshare.
+		 */
+		hugetlb_vma_lock_read(src_vma);
 	}
 
 	last_addr_mask = hugetlb_mask_last_page(h);
@@ -4941,6 +4949,8 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	if (cow) {
 		raw_write_seqcount_end(&src->write_protect_seq);
 		mmu_notifier_invalidate_range_end(&range);
+	} else {
+		hugetlb_vma_unlock_read(src_vma);
 	}
 
 	return ret;
@@ -4999,6 +5009,7 @@  int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(&range);
 	last_addr_mask = hugetlb_mask_last_page(h);
 	/* Prevent race with file truncation */
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(mapping);
 	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
 		src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -5030,6 +5041,7 @@  int move_hugetlb_page_tables(struct vm_area_struct *vma,
 		flush_tlb_range(vma, old_end - len, old_end);
 	mmu_notifier_invalidate_range_end(&range);
 	i_mmap_unlock_write(mapping);
+	hugetlb_vma_unlock_write(vma);
 
 	return len + old_addr - old_end;
 }
@@ -5349,8 +5361,29 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * may get SIGKILLed if it later faults.
 		 */
 		if (outside_reserve) {
+			struct address_space *mapping = vma->vm_file->f_mapping;
+			pgoff_t idx;
+			u32 hash;
+
 			put_page(old_page);
+			/*
+			 * Drop hugetlb_fault_mutex and vma_lock before
+			 * unmapping.  unmapping needs to hold vma_lock
+			 * in write mode.  Dropping vma_lock in read mode
+			 * here is OK as COW mappings do not interact with
+			 * PMD sharing.
+			 *
+			 * Reacquire both after unmap operation.
+			 */
+			idx = vma_hugecache_offset(h, vma, haddr);
+			hash = hugetlb_fault_mutex_hash(mapping, idx);
+			hugetlb_vma_unlock_read(vma);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
 			unmap_ref_private(mm, vma, old_page, haddr);
+
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			hugetlb_vma_lock_read(vma);
 			spin_lock(ptl);
 			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 			if (likely(ptep &&
@@ -5499,14 +5532,16 @@  static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
 	};
 
 	/*
-	 * hugetlb_fault_mutex and i_mmap_rwsem must be
+	 * vma_lock and hugetlb_fault_mutex must be
 	 * dropped before handling userfault.  Reacquire
 	 * after handling fault to make calling code simpler.
 	 */
+	hugetlb_vma_unlock_read(vma);
 	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	ret = handle_userfault(&vmf, reason);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
+	hugetlb_vma_lock_read(vma);
 
 	return ret;
 }
@@ -5740,6 +5775,11 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
+		/*
+		 * Since we hold no locks, ptep could be stale.  That is
+		 * OK as we are only making decisions based on content and
+		 * not actually modifying content here.
+		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
 			migration_entry_wait_huge(vma, ptep);
@@ -5747,23 +5787,35 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
 				VM_FAULT_SET_HINDEX(hstate_index(h));
-	} else {
-		ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
-		if (!ptep)
-			return VM_FAULT_OOM;
 	}
 
-	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, vma, haddr);
-
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, haddr);
 	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
+	/*
+	 * Acquire vma lock before calling huge_pte_alloc and hold
+	 * until finished with ptep.  This prevents huge_pmd_unshare from
+	 * being called elsewhere and making the ptep no longer valid.
+	 *
+	 * ptep could have already been assigned via huge_pte_offset.  That
+	 * is OK, as huge_pte_alloc will return the same value unless
+	 * something has changed.
+	 */
+	hugetlb_vma_lock_read(vma);
+	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
+	if (!ptep) {
+		hugetlb_vma_unlock_read(vma);
+		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+		return VM_FAULT_OOM;
+	}
+
 	entry = huge_ptep_get(ptep);
 	/* PTE markers should be handled the same way as none pte */
 	if (huge_pte_none_mostly(entry)) {
@@ -5824,6 +5876,7 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unlock_page(pagecache_page);
 			put_page(pagecache_page);
 		}
+		hugetlb_vma_unlock_read(vma);
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		return handle_userfault(&vmf, VM_UFFD_WP);
 	}
@@ -5867,6 +5920,7 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(pagecache_page);
 	}
 out_mutex:
+	hugetlb_vma_unlock_read(vma);
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
@@ -6329,8 +6383,9 @@  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);
 
 	mmu_notifier_invalidate_range_start(&range);
-	last_addr_mask = hugetlb_mask_last_page(h);
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	for (; address < end; address += psize) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address, psize);
@@ -6429,6 +6484,7 @@  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	hugetlb_vma_unlock_write(vma);
 	mmu_notifier_invalidate_range_end(&range);
 
 	return pages << h->order;
@@ -6930,6 +6986,7 @@  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud = pud_offset(p4d, addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	hugetlb_vma_assert_locked(vma);
 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
 	if (page_count(virt_to_page(ptep)) == 1)
 		return 0;
@@ -6941,6 +6998,31 @@  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 {
 }
@@ -7318,6 +7400,7 @@  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = huge_pte_offset(mm, address, sz);
@@ -7329,6 +7412,7 @@  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	hugetlb_vma_unlock_write(vma);
 	/*
 	 * No need to call mmu_notifier_invalidate_range(), see
 	 * Documentation/mm/mmu_notifier.rst.
diff --git a/mm/memory.c b/mm/memory.c
index c4c3c2fd4f45..118e5f023597 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1685,10 +1685,12 @@  static void unmap_single_vma(struct mmu_gather *tlb,
 			if (vma->vm_file) {
 				zap_flags_t zap_flags = details ?
 				    details->zap_flags : 0;
+				hugetlb_vma_lock_write(vma);
 				i_mmap_lock_write(vma->vm_file->f_mapping);
 				__unmap_hugepage_range_final(tlb, vma, start, end,
 							     NULL, zap_flags);
 				i_mmap_unlock_write(vma->vm_file->f_mapping);
+				hugetlb_vma_unlock_write(vma);
 			}
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
diff --git a/mm/rmap.c b/mm/rmap.c
index 744faaef0489..2ec925e5fa6a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1554,24 +1554,39 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * To call huge_pmd_unshare, i_mmap_rwsem must be
 			 * held in write mode.  Caller needs to explicitly
 			 * do this outside rmap routines.
+			 *
+			 * We also must hold hugetlb vma_lock in write mode.
+			 * Lock order dictates acquiring vma_lock BEFORE
+			 * i_mmap_rwsem.  We can only try lock here and fail
+			 * if unsuccessful.
 			 */
-			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-				flush_tlb_range(vma, range.start, range.end);
-				mmu_notifier_invalidate_range(mm, range.start,
-							      range.end);
-
-				/*
-				 * The ref count of the PMD page was dropped
-				 * which is part of the way map counting
-				 * is done for shared PMDs.  Return 'true'
-				 * here.  When there is no other sharing,
-				 * huge_pmd_unshare returns false and we will
-				 * unmap the actual page and drop map count
-				 * to zero.
-				 */
-				page_vma_mapped_walk_done(&pvmw);
-				break;
+			if (!anon) {
+				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				if (!hugetlb_vma_trylock_write(vma)) {
+					page_vma_mapped_walk_done(&pvmw);
+					ret = false;
+					break;
+				}
+				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+					hugetlb_vma_unlock_write(vma);
+					flush_tlb_range(vma,
+						range.start, range.end);
+					mmu_notifier_invalidate_range(mm,
+						range.start, range.end);
+					/*
+					 * The ref count of the PMD page was
+					 * dropped which is part of the way map
+					 * counting is done for shared PMDs.
+					 * Return 'true' here.  When there is
+					 * no other sharing, huge_pmd_unshare
+					 * returns false and we will unmap the
+					 * actual page and drop map count
+					 * to zero.
+					 */
+					page_vma_mapped_walk_done(&pvmw);
+					break;
+				}
+				hugetlb_vma_unlock_write(vma);
 			}
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
 		} else {
@@ -1929,26 +1944,41 @@  static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * To call huge_pmd_unshare, i_mmap_rwsem must be
 			 * held in write mode.  Caller needs to explicitly
 			 * do this outside rmap routines.
+			 *
+			 * We also must hold hugetlb vma_lock in write mode.
+			 * Lock order dictates acquiring vma_lock BEFORE
+			 * i_mmap_rwsem.  We can only try lock here and
+			 * fail if unsuccessful.
 			 */
-			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-				flush_tlb_range(vma, range.start, range.end);
-				mmu_notifier_invalidate_range(mm, range.start,
-							      range.end);
-
-				/*
-				 * The ref count of the PMD page was dropped
-				 * which is part of the way map counting
-				 * is done for shared PMDs.  Return 'true'
-				 * here.  When there is no other sharing,
-				 * huge_pmd_unshare returns false and we will
-				 * unmap the actual page and drop map count
-				 * to zero.
-				 */
-				page_vma_mapped_walk_done(&pvmw);
-				break;
+			if (!anon) {
+				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				if (!hugetlb_vma_trylock_write(vma)) {
+					page_vma_mapped_walk_done(&pvmw);
+					ret = false;
+					break;
+				}
+				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+					hugetlb_vma_unlock_write(vma);
+					flush_tlb_range(vma,
+						range.start, range.end);
+					mmu_notifier_invalidate_range(mm,
+						range.start, range.end);
+
+					/*
+					 * The ref count of the PMD page was
+					 * dropped which is part of the way map
+					 * counting is done for shared PMDs.
+					 * Return 'true' here.  When there is
+					 * no other sharing, huge_pmd_unshare
+					 * returns false and we will unmap the
+					 * actual page and drop map count
+					 * to zero.
+					 */
+					page_vma_mapped_walk_done(&pvmw);
+					break;
+				}
+				hugetlb_vma_unlock_write(vma);
 			}
-
 			/* Nuke the hugetlb page table entry */
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
 		} else {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0fdbd2c05587..e24e8a47ce8a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -379,16 +379,21 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		BUG_ON(dst_addr >= dst_start + len);
 
 		/*
-		 * Serialize via hugetlb_fault_mutex.
+		 * Serialize via vma_lock and hugetlb_fault_mutex.
+		 * vma_lock ensures the dst_pte remains valid even
+		 * in the case of shared pmds.  fault mutex prevents
+		 * races with other faulting threads.
 		 */
 		idx = linear_page_index(dst_vma, dst_addr);
 		mapping = dst_vma->vm_file->f_mapping;
 		hash = hugetlb_fault_mutex_hash(mapping, idx);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
+		hugetlb_vma_lock_read(dst_vma);
 
 		err = -ENOMEM;
 		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
 		if (!dst_pte) {
+			hugetlb_vma_unlock_read(dst_vma);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			goto out_unlock;
 		}
@@ -396,6 +401,7 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		if (mode != MCOPY_ATOMIC_CONTINUE &&
 		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
 			err = -EEXIST;
+			hugetlb_vma_unlock_read(dst_vma);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			goto out_unlock;
 		}
@@ -404,6 +410,7 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					       dst_addr, src_addr, mode, &page,
 					       wp_copy);
 
+		hugetlb_vma_unlock_read(dst_vma);
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
 		cond_resched();