[v8,1/2] hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing

Message ID 20221108011910.350887-2-mike.kravetz@oracle.com (mailing list archive)
State New
Series hugetlb MADV_DONTNEED fix and zap_page_range cleanup

Commit Message

Mike Kravetz Nov. 8, 2022, 1:19 a.m. UTC
madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
tables associated with the address range.  For hugetlb vmas,
zap_page_range will call __unmap_hugepage_range_final.  However,
__unmap_hugepage_range_final assumes the passed vma is about to be removed
and deletes the vma_lock to prevent pmd sharing as the vma is on the way
out.  In the case of madvise(MADV_DONTNEED) the vma remains, but the
missing vma_lock prevents pmd sharing and could potentially lead to issues
with truncation/fault races.

This issue was originally reported here [1] as a BUG triggered in
page_try_dup_anon_rmap.  Prior to the introduction of the hugetlb
vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
prevent pmd sharing.  Subsequent faults on this vma were confused:
VM_MAYSHARE indicates a sharable vma, but since it was no longer set,
page_mapping was not set in new pages added to the page table.  This
resulted in pages that appeared anonymous in a VM_SHARED vma and
triggered the BUG.
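
For reference, a rough userspace sketch of the originally reported scenario
(illustrative only; see [1] for the actual report, and note that MAP_HUGETLB
and the 2MB size below are assumptions):

#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20;		/* assumes free 2MB hugetlb pages */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	memset(p, 1, len);		/* fault in the shared mapping */
	madvise(p, len, MADV_DONTNEED);	/* zap; affected kernels clear
					 * VM_MAYSHARE / drop the vma_lock here */
	memset(p, 2, len);		/* re-fault after the zap */
	if (fork() == 0)		/* duplicating the mapping is where the
					 * reported BUG fired, see [1] */
		_exit(0);
	wait(NULL);
	return 0;
}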

Address issue by:
- Add a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from
  unmap_vmas().  This is used to indicate the 'final' unmapping of a vma.
  When called via MADV_DONTNEED, this flag is not set and the vma_lock is
  not deleted.
- mmu notification is removed from __unmap_hugepage_range to avoid
  duplication, and notification is added to the other calling routine
  (unmap_hugepage_range).
- notification range in zap_page_range_single is updated to take into
  account the possibility of hugetlb pmd sharing.
- zap_page_range_single renamed to __zap_page_range_single to be used
  internally within mm/memory.c
- zap_vma_range() interface created to zap a range within a single vma.
- madvise_dontneed_single_vma is updated to call zap_vma_range instead of
  zap_page_range as it only operates on a range within a single vma

[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6JudD0g@mail.gmail.com/
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Wei Chen <harperchen1110@gmail.com>
Cc: <stable@vger.kernel.org>
---
 include/linux/mm.h |  5 +++++
 mm/hugetlb.c       | 45 +++++++++++++++++++++++++++------------------
 mm/madvise.c       |  4 ++--
 mm/memory.c        | 17 +++++++++++++----
 4 files changed, 47 insertions(+), 24 deletions(-)

Comments

Peter Xu Nov. 10, 2022, 8:56 p.m. UTC | #1
Hi, Mike,

Sorry to be late, took me quite some time working on another bug..

On Mon, Nov 07, 2022 at 05:19:09PM -0800, Mike Kravetz wrote:
> madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
> tables associated with the address range.  For hugetlb vmas,
> zap_page_range will call __unmap_hugepage_range_final.  However,
> __unmap_hugepage_range_final assumes the passed vma is about to be removed
> and deletes the vma_lock to prevent pmd sharing as the vma is on the way
> out.  In the case of madvise(MADV_DONTNEED) the vma remains, but the
> missing vma_lock prevents pmd sharing and could potentially lead to issues
> with truncation/fault races.
> 
> This issue was originally reported here [1] as a BUG triggered in
> page_try_dup_anon_rmap.  Prior to the introduction of the hugetlb
> vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
> prevent pmd sharing.  Subsequent faults on this vma were confused as
> VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
> not set in new pages added to the page table.  This resulted in pages that
> appeared anonymous in a VM_SHARED vma and triggered the BUG.
> 
> Address issue by:
> - Add a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from
>   unmap_vmas().  This is used to indicate the 'final' unmapping of a vma.
>   When called via MADV_DONTNEED, this flag is not set and the vma_lock is
>   not deleted.
> - mmu notification is removed from __unmap_hugepage_range to avoid
>   duplication, and notification is added to the other calling routine
>   (unmap_hugepage_range).
> - notification range in zap_page_range_single is updated to take into
>   account the possibility of hugetlb pmd sharing.
> - zap_page_range_single renamed to __zap_page_range_single to be used
>   internally within mm/memory.c
> - zap_vma_range() interface created to zap a range within a single vma.
> - madvise_dontneed_single_vma is updated to call zap_vma_range instead of
>   zap_page_range as it only operates on a range within a single vma
> 
> [1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6JudD0g@mail.gmail.com/
> Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> Reported-by: Wei Chen <harperchen1110@gmail.com>
> Cc: <stable@vger.kernel.org>
> ---
>  include/linux/mm.h |  5 +++++
>  mm/hugetlb.c       | 45 +++++++++++++++++++++++++++------------------
>  mm/madvise.c       |  4 ++--
>  mm/memory.c        | 17 +++++++++++++----
>  4 files changed, 47 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 978c17df053e..d205bcd9cd2e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1840,6 +1840,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
>  		  unsigned long size);
>  void zap_page_range(struct vm_area_struct *vma, unsigned long address,
>  		    unsigned long size);
> +void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
> +		    unsigned long size);
>  void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
>  		struct vm_area_struct *start_vma, unsigned long start,
>  		unsigned long end);
> @@ -3464,4 +3466,7 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
>   */
>  #define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
>  
> +/* Set in unmap_vmas() to indicate an unmap call.  Only used by hugetlb */
> +#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))

It seems this is not set anywhere in the patch?

> +
>  #endif /* _LINUX_MM_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ceb47c4e183a..7c8fbce4441e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5072,7 +5072,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
>  	struct page *page;
>  	struct hstate *h = hstate_vma(vma);
>  	unsigned long sz = huge_page_size(h);
> -	struct mmu_notifier_range range;
>  	unsigned long last_addr_mask;
>  	bool force_flush = false;
>  
> @@ -5087,13 +5086,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
>  	tlb_change_page_size(tlb, sz);
>  	tlb_start_vma(tlb, vma);
>  
> -	/*
> -	 * If sharing possible, alert mmu notifiers of worst case.
> -	 */
> -	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
> -				end);
> -	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> -	mmu_notifier_invalidate_range_start(&range);
>  	last_addr_mask = hugetlb_mask_last_page(h);
>  	address = start;
>  	for (; address < end; address += sz) {
> @@ -5178,7 +5170,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
>  		if (ref_page)
>  			break;
>  	}
> -	mmu_notifier_invalidate_range_end(&range);
>  	tlb_end_vma(tlb, vma);
>  
>  	/*
> @@ -5203,32 +5194,50 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
>  			  unsigned long end, struct page *ref_page,
>  			  zap_flags_t zap_flags)
>  {
> +	bool final = zap_flags & ZAP_FLAG_UNMAP;
> +
>  	hugetlb_vma_lock_write(vma);
>  	i_mmap_lock_write(vma->vm_file->f_mapping);
>  
>  	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
>  
>  	/*
> -	 * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
> -	 * the vma_lock is freed, this makes the vma ineligible for pmd
> -	 * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
> -	 * This is important as page tables for this unmapped range will
> -	 * be asynchrously deleted.  If the page tables are shared, there
> -	 * will be issues when accessed by someone else.
> +	 * When called via zap_vma_range (MADV_DONTNEED), this is not the
> +	 * final unmap of the vma, and we do not want to delete the vma_lock.
>  	 */
> -	__hugetlb_vma_unlock_write_free(vma);
> -
> -	i_mmap_unlock_write(vma->vm_file->f_mapping);
> +	if (final) {
> +		/*
> +		 * Unlock and free the vma lock before releasing i_mmap_rwsem.
> +		 * When the vma_lock is freed, this makes the vma ineligible
> +		 * for pmd sharing.  And, i_mmap_rwsem is required to set up
> +		 * pmd sharing.  This is important as page tables for this
> +		 * unmapped range will be asynchrously deleted.  If the page
> +		 * tables are shared, there will be issues when accessed by
> +		 * someone else.
> +		 */
> +		__hugetlb_vma_unlock_write_free(vma);
> +		i_mmap_unlock_write(vma->vm_file->f_mapping);
> +	} else {
> +		i_mmap_unlock_write(vma->vm_file->f_mapping);
> +		hugetlb_vma_unlock_write(vma);
> +	}
>  }
>  
>  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
>  			  unsigned long end, struct page *ref_page,
>  			  zap_flags_t zap_flags)
>  {
> +	struct mmu_notifier_range range;
>  	struct mmu_gather tlb;
>  
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,

Should this be s/UNMAP/CLEAR/?  As IIUC the unmap path was only happening
in __unmap_hugepage_range_final().

> +				start, end);
> +	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
>  	tlb_gather_mmu(&tlb, vma->vm_mm);
> +
>  	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
> +
> +	mmu_notifier_invalidate_range_end(&range);
>  	tlb_finish_mmu(&tlb);
>  }
>  
> diff --git a/mm/madvise.c b/mm/madvise.c
> index c7105ec6d08c..9d2625b8029a 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -772,7 +772,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
>   * Application no longer needs these pages.  If the pages are dirty,
>   * it's OK to just throw them away.  The app will be more careful about
>   * data it wants to keep.  Be sure to free swap resources too.  The
> - * zap_page_range call sets things up for shrink_active_list to actually free
> + * zap_vma_range call sets things up for shrink_active_list to actually free
>   * these pages later if no one else has touched them in the meantime,
>   * although we could add these pages to a global reuse list for
>   * shrink_active_list to pick up before reclaiming other pages.
> @@ -790,7 +790,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
>  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
>  					unsigned long start, unsigned long end)
>  {
> -	zap_page_range(vma, start, end - start);
> +	zap_vma_range(vma, start, end - start);

I'd rather just call zap_page_range_single() directly with NULL passed
over, considering that this is for stable, but no strong opinions.

>  	return 0;
>  }
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 6090124b64f1..af3a4724b464 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1717,7 +1717,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
>  }
>  
>  /**
> - * zap_page_range_single - remove user pages in a given range
> + * __zap_page_range_single - remove user pages in a given range

Same nitpick here, I'd rather keep the name at least for this patch.  But
again, no strong opinion.

>   * @vma: vm_area_struct holding the applicable pages
>   * @address: starting address of pages to zap
>   * @size: number of bytes to zap
> @@ -1725,7 +1725,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
>   *
>   * The range must fit into one VMA.
>   */
> -static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
> +static void __zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
>  		unsigned long size, struct zap_details *details)
>  {
>  	struct mmu_notifier_range range;
> @@ -1734,6 +1734,9 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
>  	lru_add_drain();
>  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
>  				address, address + size);
> +	if (is_vm_hugetlb_page(vma))
> +		adjust_range_if_pmd_sharing_possible(vma, &range.start,
> +							&range.end);
>  	tlb_gather_mmu(&tlb, vma->vm_mm);
>  	update_hiwater_rss(vma->vm_mm);
>  	mmu_notifier_invalidate_range_start(&range);
> @@ -1742,6 +1745,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
>  	tlb_finish_mmu(&tlb);
>  }
>  
> +void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
> +		unsigned long size)
> +{
> +	__zap_page_range_single(vma, address, size, NULL);
> +}
> +
>  /**
>   * zap_vma_ptes - remove ptes mapping the vma
>   * @vma: vm_area_struct holding ptes to be zapped
> @@ -1760,7 +1769,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
>  	    		!(vma->vm_flags & VM_PFNMAP))
>  		return;
>  
> -	zap_page_range_single(vma, address, size, NULL);
> +	__zap_page_range_single(vma, address, size, NULL);
>  }
>  EXPORT_SYMBOL_GPL(zap_vma_ptes);
>  
> @@ -3453,7 +3462,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
>  		unsigned long start_addr, unsigned long end_addr,
>  		struct zap_details *details)
>  {
> -	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
> +	__zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
>  }
>  
>  static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
> -- 
> 2.37.3
> 
>
Nadav Amit Nov. 10, 2022, 8:59 p.m. UTC | #2
On Nov 7, 2022, at 5:19 PM, Mike Kravetz <mike.kravetz@oracle.com> wrote:

> madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
> tables associated with the address range.  For hugetlb vmas,
> zap_page_range will call __unmap_hugepage_range_final.  However,
> __unmap_hugepage_range_final assumes the passed vma is about to be removed
> and deletes the vma_lock to prevent pmd sharing as the vma is on the way
> out.  In the case of madvise(MADV_DONTNEED) the vma remains, but the
> missing vma_lock prevents pmd sharing and could potentially lead to issues
> with truncation/fault races.

I understand the problem in general. Please consider my feedback as partial
though.


> @@ -5203,32 +5194,50 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> 			  unsigned long end, struct page *ref_page,
> 			  zap_flags_t zap_flags)
> {
> +	bool final = zap_flags & ZAP_FLAG_UNMAP;
> +

Not sure why caching 'final' in a local variable helps.

> 
> void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
> 			  unsigned long end, struct page *ref_page,
> 			  zap_flags_t zap_flags)
> {
> +	struct mmu_notifier_range range;
> 	struct mmu_gather tlb;
> 
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
> +				start, end);
> +	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> 	tlb_gather_mmu(&tlb, vma->vm_mm);
> +
> 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);

Is there a reason for not using range.start and range.end?

It is just that every inconsistency is worrying…

> 
> @@ -1734,6 +1734,9 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
> 	lru_add_drain();
> 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> 				address, address + size);
> +	if (is_vm_hugetlb_page(vma))
> +		adjust_range_if_pmd_sharing_possible(vma, &range.start,
> +							&range.end);
> 	tlb_gather_mmu(&tlb, vma->vm_mm);
> 	update_hiwater_rss(vma->vm_mm);
> 	mmu_notifier_invalidate_range_start(&range);
> @@ -1742,6 +1745,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
> 	tlb_finish_mmu(&tlb);
> }
> 
> +void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
> +		unsigned long size)
> +{
> +	__zap_page_range_single(vma, address, size, NULL);

Ugh. So zap_vma_range() would actually be emitted as a wrapper function
that only calls __zap_page_range_single() (or, worse, __zap_page_range_single(),
which is large, would get inlined into it), unless you use LTO.

Another option is to declare __zap_page_range_single() in the header and move
this one to the header as an inline wrapper.
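
Something like this is what I have in mind (untested sketch; a bare forward
declaration of struct zap_details should be enough for the prototype):

/* include/linux/mm.h */
struct zap_details;
void __zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
			     unsigned long size, struct zap_details *details);

static inline void zap_vma_range(struct vm_area_struct *vma,
				 unsigned long address, unsigned long size)
{
	__zap_page_range_single(vma, address, size, NULL);
}

__zap_page_range_single() would then lose its static in mm/memory.c.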
Peter Xu Nov. 10, 2022, 9:07 p.m. UTC | #3
Maybe it'll also be good if we split this into a few smaller ones?

For example:

On Mon, Nov 07, 2022 at 05:19:09PM -0800, Mike Kravetz wrote:
> Address issue by:
> - Add a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from
>   unmap_vmas().  This is used to indicate the 'final' unmapping of a vma.
>   When called via MADV_DONTNEED, this flag is not set and the vma_lock is
>   not deleted.

One patch to fix the real thing (patch 2).

> - mmu notification is removed from __unmap_hugepage_range to avoid
>   duplication, and notification is added to the other calling routine
>   (unmap_hugepage_range).

One patch to fix the mmu notifier (patch 1).

> - notification range in zap_page_range_single is updated to take into
>   account the possibility of hugetlb pmd sharing.

Part of patch 2.

> - zap_page_range_single renamed to __zap_page_range_single to be used
>   internally within mm/memory.c
> - zap_vma_range() interface created to zap a range within a single vma.

Another patch 3 for the two.

> - madvise_dontneed_single_vma is updated to call zap_vma_range instead of
>   zap_page_range as it only operates on a range within a single vma

Part of patch 2.

Then patches 1 & 2 will need to Cc stable, 3 won't need to.  Patch 2 in
this series will become patch 4.  Not sure whether that looks cleaner.

Mike, sorry again if this comment is too late; feel free to go with
whichever way you think proper.

Thanks,
Mike Kravetz Nov. 10, 2022, 9:48 p.m. UTC | #4
On 11/10/22 12:59, Nadav Amit wrote:
> On Nov 7, 2022, at 5:19 PM, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> > madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
> > tables associated with the address range.  For hugetlb vmas,
> > zap_page_range will call __unmap_hugepage_range_final.  However,
> > __unmap_hugepage_range_final assumes the passed vma is about to be removed
> > and deletes the vma_lock to prevent pmd sharing as the vma is on the way
> > out.  In the case of madvise(MADV_DONTNEED) the vma remains, but the
> > missing vma_lock prevents pmd sharing and could potentially lead to issues
> > with truncation/fault races.
> 
> I understand the problem in general. Please consider my feedback as partial
> though.

Thanks for taking a look!

> 
> > @@ -5203,32 +5194,50 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> > 			  unsigned long end, struct page *ref_page,
> > 			  zap_flags_t zap_flags)
> > {
> > +	bool final = zap_flags & ZAP_FLAG_UNMAP;
> > +
> 
> Not sure why caching final in local variable helps.

No particular reason.  It can be eliminated.
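
i.e. the next version can just test the flag directly (same logic as in the
patch, sketch):

	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap of the vma */
		__hugetlb_vma_unlock_write_free(vma);
		i_mmap_unlock_write(vma->vm_file->f_mapping);
	} else {
		i_mmap_unlock_write(vma->vm_file->f_mapping);
		hugetlb_vma_unlock_write(vma);
	}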

> 
> > 
> > void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
> > 			  unsigned long end, struct page *ref_page,
> > 			  zap_flags_t zap_flags)
> > {
> > +	struct mmu_notifier_range range;
> > 	struct mmu_gather tlb;
> > 
> > +	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
> > +				start, end);
> > +	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> > 	tlb_gather_mmu(&tlb, vma->vm_mm);
> > +
> > 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
> 
> Is there a reason for not using range.start and range.end?

After calling adjust_range_if_pmd_sharing_possible, range.start - range.end
could be much greater than the range we actually want to unmap.  The range
gets adjusted to account for pmd sharing if that is POSSIBLE.  It does not
know for sure if we will actually 'unshare a pmd'.

I suppose adjust_range_if_pmd_sharing_possible could be modified to actually
check if unmapping will result in unsharing, but it does not do that today.
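
For reference, the helper only widens the range when sharing is possible for
the vma at all, and then just rounds out to PUD boundaries, roughly
(paraphrased sketch, not a verbatim excerpt):

	/* adjust_range_if_pmd_sharing_possible(), roughly */
	if (!(vma->vm_flags & VM_MAYSHARE))
		return;
	/* worst case: any part of the range might sit under a shared PUD */
	if (*start > ALIGN(vma->vm_start, PUD_SIZE))
		*start = ALIGN_DOWN(*start, PUD_SIZE);
	if (*end < ALIGN_DOWN(vma->vm_end, PUD_SIZE))
		*end = ALIGN(*end, PUD_SIZE);

On x86 that means rounding out to 1GB boundaries, so the notifier range can be
much larger than the range actually being unmapped.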

> 
> It is just that every inconsistency is worrying…
> 
> > 
> > @@ -1734,6 +1734,9 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
> > 	lru_add_drain();
> > 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > 				address, address + size);
> > +	if (is_vm_hugetlb_page(vma))
> > +		adjust_range_if_pmd_sharing_possible(vma, &range.start,
> > +							&range.end);
> > 	tlb_gather_mmu(&tlb, vma->vm_mm);
> > 	update_hiwater_rss(vma->vm_mm);
> > 	mmu_notifier_invalidate_range_start(&range);
> > @@ -1742,6 +1745,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
> > 	tlb_finish_mmu(&tlb);
> > }
> > 
> > +void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
> > +		unsigned long size)
> > +{
> > +	__zap_page_range_single(vma, address, size, NULL);
> 
> Ugh. So zap_vma_range() would actually be emitted as a wrapper function that
> only calls __zap_page_range_single() (or worse __zap_page_range_single()
> which is large would be inlined), unless you use LTO.
> 
> Another option is to declare __zap_page_range_size() in the header and move
> this one to the header as inline wrapper.

Ok, I will keep that in mind.

However, Peter seems to prefer just exposing the current zap_page_range_single.
The issue with exposing zap_page_range_single as it is today is that we also
need to expose 'struct zap_details', which currently is not visible outside
mm/memory.c.
Mike Kravetz Nov. 10, 2022, 9:55 p.m. UTC | #5
On 11/10/22 15:56, Peter Xu wrote:
> Hi, Mike,
> 
> Sorry to be late, took me quite some time working on another bug..
> 
> On Mon, Nov 07, 2022 at 05:19:09PM -0800, Mike Kravetz wrote:
> > madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
> > tables associated with the address range.  For hugetlb vmas,
> > zap_page_range will call __unmap_hugepage_range_final.  However,
> > __unmap_hugepage_range_final assumes the passed vma is about to be removed
> > and deletes the vma_lock to prevent pmd sharing as the vma is on the way
> > out.  In the case of madvise(MADV_DONTNEED) the vma remains, but the
> > missing vma_lock prevents pmd sharing and could potentially lead to issues
> > with truncation/fault races.
> > 
> > This issue was originally reported here [1] as a BUG triggered in
> > page_try_dup_anon_rmap.  Prior to the introduction of the hugetlb
> > vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
> > prevent pmd sharing.  Subsequent faults on this vma were confused as
> > VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
> > not set in new pages added to the page table.  This resulted in pages that
> > appeared anonymous in a VM_SHARED vma and triggered the BUG.
> > 
> > Address issue by:
> > - Add a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from
> >   unmap_vmas().  This is used to indicate the 'final' unmapping of a vma.
> >   When called via MADV_DONTNEED, this flag is not set and the vma_lock is
> >   not deleted.
> > - mmu notification is removed from __unmap_hugepage_range to avoid
> >   duplication, and notification is added to the other calling routine
> >   (unmap_hugepage_range).
> > - notification range in zap_page_range_single is updated to take into
> >   account the possibility of hugetlb pmd sharing.
> > - zap_page_range_single renamed to __zap_page_range_single to be used
> >   internally within mm/memory.c
> > - zap_vma_range() interface created to zap a range within a single vma.
> > - madvise_dontneed_single_vma is updated to call zap_vma_range instead of
> >   zap_page_range as it only operates on a range within a single vma
> > 
> > [1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6JudD0g@mail.gmail.com/
> > Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > Reported-by: Wei Chen <harperchen1110@gmail.com>
> > Cc: <stable@vger.kernel.org>
> > ---
> >  include/linux/mm.h |  5 +++++
> >  mm/hugetlb.c       | 45 +++++++++++++++++++++++++++------------------
> >  mm/madvise.c       |  4 ++--
> >  mm/memory.c        | 17 +++++++++++++----
> >  4 files changed, 47 insertions(+), 24 deletions(-)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 978c17df053e..d205bcd9cd2e 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1840,6 +1840,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
> >  		  unsigned long size);
> >  void zap_page_range(struct vm_area_struct *vma, unsigned long address,
> >  		    unsigned long size);
> > +void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
> > +		    unsigned long size);
> >  void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
> >  		struct vm_area_struct *start_vma, unsigned long start,
> >  		unsigned long end);
> > @@ -3464,4 +3466,7 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> >   */
> >  #define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
> >  
> > +/* Set in unmap_vmas() to indicate an unmap call.  Only used by hugetlb */
> > +#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
> 
> It seems this is not set anywhere in the patch?

Correct.  Should be set in unmap_vmas.
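
Something along these lines in the next revision (untested sketch; assumes
unmap_vmas() keeps building its local zap_details, with ZAP_FLAG_DROP_MARKER
and even_cows, as it does today):

	/* mm/memory.c: unmap_vmas() marks the final unmap of each vma */
	struct zap_details details = {
		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
		.even_cows = true,
	};
	...
	unmap_single_vma(tlb, vma, start_addr, end_addr, &details);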

> > +
> >  #endif /* _LINUX_MM_H */
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index ceb47c4e183a..7c8fbce4441e 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -5072,7 +5072,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> >  	struct page *page;
> >  	struct hstate *h = hstate_vma(vma);
> >  	unsigned long sz = huge_page_size(h);
> > -	struct mmu_notifier_range range;
> >  	unsigned long last_addr_mask;
> >  	bool force_flush = false;
> >  
> > @@ -5087,13 +5086,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> >  	tlb_change_page_size(tlb, sz);
> >  	tlb_start_vma(tlb, vma);
> >  
> > -	/*
> > -	 * If sharing possible, alert mmu notifiers of worst case.
> > -	 */
> > -	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
> > -				end);
> > -	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> > -	mmu_notifier_invalidate_range_start(&range);
> >  	last_addr_mask = hugetlb_mask_last_page(h);
> >  	address = start;
> >  	for (; address < end; address += sz) {
> > @@ -5178,7 +5170,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> >  		if (ref_page)
> >  			break;
> >  	}
> > -	mmu_notifier_invalidate_range_end(&range);
> >  	tlb_end_vma(tlb, vma);
> >  
> >  	/*
> > @@ -5203,32 +5194,50 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> >  			  unsigned long end, struct page *ref_page,
> >  			  zap_flags_t zap_flags)
> >  {
> > +	bool final = zap_flags & ZAP_FLAG_UNMAP;
> > +
> >  	hugetlb_vma_lock_write(vma);
> >  	i_mmap_lock_write(vma->vm_file->f_mapping);
> >  
> >  	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
> >  
> >  	/*
> > -	 * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
> > -	 * the vma_lock is freed, this makes the vma ineligible for pmd
> > -	 * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
> > -	 * This is important as page tables for this unmapped range will
> > -	 * be asynchrously deleted.  If the page tables are shared, there
> > -	 * will be issues when accessed by someone else.
> > +	 * When called via zap_vma_range (MADV_DONTNEED), this is not the
> > +	 * final unmap of the vma, and we do not want to delete the vma_lock.
> >  	 */
> > -	__hugetlb_vma_unlock_write_free(vma);
> > -
> > -	i_mmap_unlock_write(vma->vm_file->f_mapping);
> > +	if (final) {
> > +		/*
> > +		 * Unlock and free the vma lock before releasing i_mmap_rwsem.
> > +		 * When the vma_lock is freed, this makes the vma ineligible
> > +		 * for pmd sharing.  And, i_mmap_rwsem is required to set up
> > +		 * pmd sharing.  This is important as page tables for this
> > +		 * unmapped range will be asynchrously deleted.  If the page
> > +		 * tables are shared, there will be issues when accessed by
> > +		 * someone else.
> > +		 */
> > +		__hugetlb_vma_unlock_write_free(vma);
> > +		i_mmap_unlock_write(vma->vm_file->f_mapping);
> > +	} else {
> > +		i_mmap_unlock_write(vma->vm_file->f_mapping);
> > +		hugetlb_vma_unlock_write(vma);
> > +	}
> >  }
> >  
> >  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
> >  			  unsigned long end, struct page *ref_page,
> >  			  zap_flags_t zap_flags)
> >  {
> > +	struct mmu_notifier_range range;
> >  	struct mmu_gather tlb;
> >  
> > +	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
> 
> Should this be s/UNMAP/CLEAR/?  As IIUC the unmap path was only happening
> in __unmap_hugepage_range_final().

Right, unmap_hugepage_range is employed in the truncate and hole punch
path where we should be using the CLEAR notifier.

> > +				start, end);
> > +	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> >  	tlb_gather_mmu(&tlb, vma->vm_mm);
> > +
> >  	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
> > +
> > +	mmu_notifier_invalidate_range_end(&range);
> >  	tlb_finish_mmu(&tlb);
> >  }
> >  
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index c7105ec6d08c..9d2625b8029a 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -772,7 +772,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
> >   * Application no longer needs these pages.  If the pages are dirty,
> >   * it's OK to just throw them away.  The app will be more careful about
> >   * data it wants to keep.  Be sure to free swap resources too.  The
> > - * zap_page_range call sets things up for shrink_active_list to actually free
> > + * zap_vma_range call sets things up for shrink_active_list to actually free
> >   * these pages later if no one else has touched them in the meantime,
> >   * although we could add these pages to a global reuse list for
> >   * shrink_active_list to pick up before reclaiming other pages.
> > @@ -790,7 +790,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
> >  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
> >  					unsigned long start, unsigned long end)
> >  {
> > -	zap_page_range(vma, start, end - start);
> > +	zap_vma_range(vma, start, end - start);
> 
> I'd rather just call zap_page_range_single() directly with NULL passed
> over, considering that this is for stable, but no strong opinions.

As mentioned in reply to Nadav, if we expose zap_page_range_single we
need to expose 'struct zap_details'.  Any concerns there?  That was the
primary reason I did not just call zap_page_range_single directly.
Peter Xu Nov. 10, 2022, 10:07 p.m. UTC | #6
On Thu, Nov 10, 2022 at 01:48:06PM -0800, Mike Kravetz wrote:
> However, Peter seems to prefer just exposing the current zap_page_range_single.  
> The issue with exposing zap_page_range_single as it is today, is that we also
> need to expose 'struct zap_details' which currently is not visible outside
> mm/memory.c.

Ah I see, that's okay to me.  Thanks,
Nadav Amit Nov. 10, 2022, 10:22 p.m. UTC | #7
On Nov 10, 2022, at 1:48 PM, Mike Kravetz <mike.kravetz@oracle.com> wrote:

>>> void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
>>> 			  unsigned long end, struct page *ref_page,
>>> 			  zap_flags_t zap_flags)
>>> {
>>> +	struct mmu_notifier_range range;
>>> 	struct mmu_gather tlb;
>>> 
>>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
>>> +				start, end);
>>> +	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
>>> 	tlb_gather_mmu(&tlb, vma->vm_mm);
>>> +
>>> 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
>> 
>> Is there a reason for not using range.start and range.end?
> 
> After calling adjust_range_if_pmd_sharing_possible, range.start - range.end
> could be much greater than the range we actually want to unmap.  The range
> gets adjusted to account for pmd sharing if that is POSSIBLE.  It does not
> know for sure if we will actually 'unshare a pmd'.
> 
> I suppose adjust_range_if_pmd_sharing_possible could be modified to actually
> check if unmapping will result in unsharing, but it does not do that today.

Thanks for the explanation. It’s probably me, but I am still not sure that I
understand the difference between __unmap_hugepage_range() using (start,
end) and __zap_page_range_single() using (address, range.end). Perhaps it’s
worth a comment in the code?

But anyhow… shouldn’t unmap_hugepage_range() call
mmu_notifier_invalidate_range_start()?
Mike Kravetz Nov. 10, 2022, 10:31 p.m. UTC | #8
On 11/10/22 14:22, Nadav Amit wrote:
> On Nov 10, 2022, at 1:48 PM, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> >>> void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
> >>> 			  unsigned long end, struct page *ref_page,
> >>> 			  zap_flags_t zap_flags)
> >>> {
> >>> +	struct mmu_notifier_range range;
> >>> 	struct mmu_gather tlb;
> >>> 
> >>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
> >>> +				start, end);
> >>> +	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> >>> 	tlb_gather_mmu(&tlb, vma->vm_mm);
> >>> +
> >>> 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
> >> 
> >> Is there a reason for not using range.start and range.end?
> > 
> > After calling adjust_range_if_pmd_sharing_possible, range.start - range.end
> > could be much greater than the range we actually want to unmap.  The range
> > gets adjusted to account for pmd sharing if that is POSSIBLE.  It does not
> > know for sure if we will actually 'unshare a pmd'.
> > 
> > I suppose adjust_range_if_pmd_sharing_possible could be modified to actually
> > check if unmapping will result in unsharing, but it does not do that today.
> 
> Thanks for the explanation. It’s probably me, but I am still not sure that I
> understand the difference between __unmap_hugepage_range() using (start,
> end) and __zap_page_range_single() using (address, range.end). Perhaps it’s
> worth a comment in the code?

__zap_page_range_single is wrong.  It should have been updated to use
the range address - (address + size).

At Peter's suggestion the mmu notifier updates will be broken out in a
separate patch.  I will also add comments, to make this easier to follow.
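
Roughly, the zap there should stay bounded to the requested range and only the
notifier should use the adjusted one (sketch):

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address, address + size);
	if (is_vm_hugetlb_page(vma))
		adjust_range_if_pmd_sharing_possible(vma, &range.start,
						     &range.end);
	tlb_gather_mmu(&tlb, vma->vm_mm);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	/* zap only address - (address + size), not the widened range.end */
	unmap_single_vma(&tlb, vma, address, address + size, details);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);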

> But anyhow… shouldn’t unmap_hugepage_range() call
> mmu_notifier_invalidate_range_start()?

Yes, thanks!
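
So unmap_hugepage_range() should end up looking roughly like this in the next
revision (sketch, folding in Peter's s/UNMAP/CLEAR/ and the missing _start
call):

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end, struct page *ref_page,
			  zap_flags_t zap_flags)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, end);
	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
	mmu_notifier_invalidate_range_start(&range);
	tlb_gather_mmu(&tlb, vma->vm_mm);

	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);

	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);
}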

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 978c17df053e..d205bcd9cd2e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1840,6 +1840,8 @@  void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		  unsigned long size);
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		    unsigned long size);
+void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
+		    unsigned long size);
 void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
 		struct vm_area_struct *start_vma, unsigned long start,
 		unsigned long end);
@@ -3464,4 +3466,7 @@  madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  */
 #define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
 
+/* Set in unmap_vmas() to indicate an unmap call.  Only used by hugetlb */
+#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
+
 #endif /* _LINUX_MM_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ceb47c4e183a..7c8fbce4441e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5072,7 +5072,6 @@  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
-	struct mmu_notifier_range range;
 	unsigned long last_addr_mask;
 	bool force_flush = false;
 
@@ -5087,13 +5086,6 @@  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	tlb_change_page_size(tlb, sz);
 	tlb_start_vma(tlb, vma);
 
-	/*
-	 * If sharing possible, alert mmu notifiers of worst case.
-	 */
-	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
-				end);
-	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
-	mmu_notifier_invalidate_range_start(&range);
 	last_addr_mask = hugetlb_mask_last_page(h);
 	address = start;
 	for (; address < end; address += sz) {
@@ -5178,7 +5170,6 @@  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 		if (ref_page)
 			break;
 	}
-	mmu_notifier_invalidate_range_end(&range);
 	tlb_end_vma(tlb, vma);
 
 	/*
@@ -5203,32 +5194,50 @@  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  unsigned long end, struct page *ref_page,
 			  zap_flags_t zap_flags)
 {
+	bool final = zap_flags & ZAP_FLAG_UNMAP;
+
 	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 
 	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
 	/*
-	 * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
-	 * the vma_lock is freed, this makes the vma ineligible for pmd
-	 * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
-	 * This is important as page tables for this unmapped range will
-	 * be asynchrously deleted.  If the page tables are shared, there
-	 * will be issues when accessed by someone else.
+	 * When called via zap_vma_range (MADV_DONTNEED), this is not the
+	 * final unmap of the vma, and we do not want to delete the vma_lock.
 	 */
-	__hugetlb_vma_unlock_write_free(vma);
-
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	if (final) {
+		/*
+		 * Unlock and free the vma lock before releasing i_mmap_rwsem.
+		 * When the vma_lock is freed, this makes the vma ineligible
+		 * for pmd sharing.  And, i_mmap_rwsem is required to set up
+		 * pmd sharing.  This is important as page tables for this
+		 * unmapped range will be asynchrously deleted.  If the page
+		 * tables are shared, there will be issues when accessed by
+		 * someone else.
+		 */
+		__hugetlb_vma_unlock_write_free(vma);
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page,
 			  zap_flags_t zap_flags)
 {
+	struct mmu_notifier_range range;
 	struct mmu_gather tlb;
 
+	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+				start, end);
+	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 	tlb_gather_mmu(&tlb, vma->vm_mm);
+
 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+
+	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb);
 }
 
diff --git a/mm/madvise.c b/mm/madvise.c
index c7105ec6d08c..9d2625b8029a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -772,7 +772,7 @@  static int madvise_free_single_vma(struct vm_area_struct *vma,
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
  * data it wants to keep.  Be sure to free swap resources too.  The
- * zap_page_range call sets things up for shrink_active_list to actually free
+ * zap_vma_range call sets things up for shrink_active_list to actually free
  * these pages later if no one else has touched them in the meantime,
  * although we could add these pages to a global reuse list for
  * shrink_active_list to pick up before reclaiming other pages.
@@ -790,7 +790,7 @@  static int madvise_free_single_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 					unsigned long start, unsigned long end)
 {
-	zap_page_range(vma, start, end - start);
+	zap_vma_range(vma, start, end - start);
 	return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 6090124b64f1..af3a4724b464 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1717,7 +1717,7 @@  void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 /**
- * zap_page_range_single - remove user pages in a given range
+ * __zap_page_range_single - remove user pages in a given range
  * @vma: vm_area_struct holding the applicable pages
  * @address: starting address of pages to zap
  * @size: number of bytes to zap
@@ -1725,7 +1725,7 @@  void zap_page_range(struct vm_area_struct *vma, unsigned long start,
  *
  * The range must fit into one VMA.
  */
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+static void __zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
 	struct mmu_notifier_range range;
@@ -1734,6 +1734,9 @@  static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 				address, address + size);
+	if (is_vm_hugetlb_page(vma))
+		adjust_range_if_pmd_sharing_possible(vma, &range.start,
+							&range.end);
 	tlb_gather_mmu(&tlb, vma->vm_mm);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
@@ -1742,6 +1745,12 @@  static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
 	tlb_finish_mmu(&tlb);
 }
 
+void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size)
+{
+	__zap_page_range_single(vma, address, size, NULL);
+}
+
 /**
  * zap_vma_ptes - remove ptes mapping the vma
  * @vma: vm_area_struct holding ptes to be zapped
@@ -1760,7 +1769,7 @@  void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 	    		!(vma->vm_flags & VM_PFNMAP))
 		return;
 
-	zap_page_range_single(vma, address, size, NULL);
+	__zap_page_range_single(vma, address, size, NULL);
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
@@ -3453,7 +3462,7 @@  static void unmap_mapping_range_vma(struct vm_area_struct *vma,
 		unsigned long start_addr, unsigned long end_addr,
 		struct zap_details *details)
 {
-	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
+	__zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
 }
 
 static inline void unmap_mapping_range_tree(struct rb_root_cached *root,