diff mbox series

[21/23] hugetlb/userfaultfd: Only drop uffd-wp special pte if required

Message ID 20210323005054.35973-1-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd-wp: Support shmem and hugetlbfs | expand

Commit Message

Peter Xu March 23, 2021, 12:50 a.m. UTC
Just like what we've done with shmem uffd-wp special ptes, we shouldn't drop
the uffd-wp special swap pte for hugetlb either, unless we're going to unmap
the whole vma, or we're punching a hole with safe locks held.

For example, remove_inode_hugepages() is safe to drop uffd-wp ptes, because it
has taken hugetlb fault mutex so that no concurrent page fault would trigger.
While the call to hugetlb_vmdelete_list() in hugetlbfs_punch_hole() is not
safe.  That's why the former call will be with ZAP_FLAG_DROP_FILE_UFFD_WP,
while the latter one won't be able to.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 fs/hugetlbfs/inode.c    | 15 +++++++++------
 include/linux/hugetlb.h | 13 ++++++++-----
 mm/hugetlb.c            | 27 +++++++++++++++++++++------
 mm/memory.c             |  5 ++++-
 4 files changed, 42 insertions(+), 18 deletions(-)

Comments

Mike Kravetz April 23, 2021, 8:33 p.m. UTC | #1
On 3/22/21 5:50 PM, Peter Xu wrote:
> Just like what we've done with shmem uffd-wp special ptes, we shouldn't drop
> uffd-wp special swap pte for hugetlb too, only if we're going to unmap the
> whole vma, or we're punching a hole with safe locks held.
> 
> For example, remove_inode_hugepages() is safe to drop uffd-wp ptes, because it
> has taken hugetlb fault mutex so that no concurrent page fault would trigger.
> While the call to hugetlb_vmdelete_list() in hugetlbfs_punch_hole() is not
> safe.  That's why the previous call will be with ZAP_FLAG_DROP_FILE_UFFD_WP,
> while the latter one won't be able to.

This commit message is a bit confusing, but the hugetlb hole punch code
path is a bit confusing. :)   How about something like this?

As with shmem uffd-wp special ptes, only drop the uffd-wp special swap
pte if unmapping an entire vma or synchronized such that faults cannot
race with the unmap operation.  This requires passing zap_flags all the
way to the lowest level hugetlb unmap routine: __unmap_hugepage_range.

In general, unmap calls originating in hugetlbfs code will pass the
ZAP_FLAG_DROP_FILE_UFFD_WP flag as synchronization is in place to prevent
faults.  The exception is hole punch which will first unmap without any
synchronization.  Later when hole punch actually removes the page from
the file, it will check to see if there was a subsequent fault and if
so take the hugetlb fault mutex while unmapping again.  This second
unmap will pass in ZAP_FLAG_DROP_FILE_UFFD_WP.

> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  fs/hugetlbfs/inode.c    | 15 +++++++++------
>  include/linux/hugetlb.h | 13 ++++++++-----
>  mm/hugetlb.c            | 27 +++++++++++++++++++++------
>  mm/memory.c             |  5 ++++-
>  4 files changed, 42 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index d81f52b87bd7..5fe19e801a2b 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -399,7 +399,8 @@ static void remove_huge_page(struct page *page)
>  }
>  
>  static void
> -hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
> +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
> +		      unsigned long zap_flags)
>  {
>  	struct vm_area_struct *vma;
>  
> @@ -432,7 +433,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
>  		}
>  
>  		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
> -									NULL);
> +				     NULL, zap_flags);
>  	}
>  }
>  
> @@ -513,7 +514,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>  				mutex_lock(&hugetlb_fault_mutex_table[hash]);
>  				hugetlb_vmdelete_list(&mapping->i_mmap,
>  					index * pages_per_huge_page(h),
> -					(index + 1) * pages_per_huge_page(h));
> +					(index + 1) * pages_per_huge_page(h),
> +					ZAP_FLAG_DROP_FILE_UFFD_WP);
>  				i_mmap_unlock_write(mapping);
>  			}
>  
> @@ -579,7 +581,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
>  	i_mmap_lock_write(mapping);
>  	i_size_write(inode, offset);
>  	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
> -		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
> +		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
> +				      ZAP_FLAG_DROP_FILE_UFFD_WP);
>  	i_mmap_unlock_write(mapping);
>  	remove_inode_hugepages(inode, offset, LLONG_MAX);
>  }
> @@ -612,8 +615,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		i_mmap_lock_write(mapping);
>  		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
>  			hugetlb_vmdelete_list(&mapping->i_mmap,
> -						hole_start >> PAGE_SHIFT,
> -						hole_end  >> PAGE_SHIFT);
> +					      hole_start >> PAGE_SHIFT,
> +					      hole_end >> PAGE_SHIFT, 0);
>  		i_mmap_unlock_write(mapping);
>  		remove_inode_hugepages(inode, hole_start, hole_end);
>  		inode_unlock(inode);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 92710600596e..4047fa042782 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -121,14 +121,15 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>  			 unsigned long *, unsigned long *, long, unsigned int,
>  			 int *);
>  void unmap_hugepage_range(struct vm_area_struct *,
> -			  unsigned long, unsigned long, struct page *);
> +			  unsigned long, unsigned long, struct page *,
> +			  unsigned long);
>  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
>  			  struct vm_area_struct *vma,
>  			  unsigned long start, unsigned long end,
> -			  struct page *ref_page);
> +			  struct page *ref_page, unsigned long zap_flags);
>  void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  				unsigned long start, unsigned long end,
> -				struct page *ref_page);
> +				struct page *ref_page, unsigned long zap_flags);

Nothing introduced with your patch, but it seems __unmap_hugepage_range_final
does not need to be in the header and can be static in hugetlb.c.

Everything else looks good,

Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Peter Xu April 26, 2021, 9:16 p.m. UTC | #2
On Fri, Apr 23, 2021 at 01:33:08PM -0700, Mike Kravetz wrote:
> On 3/22/21 5:50 PM, Peter Xu wrote:
> > Just like what we've done with shmem uffd-wp special ptes, we shouldn't drop
> > uffd-wp special swap pte for hugetlb too, only if we're going to unmap the
> > whole vma, or we're punching a hole with safe locks held.
> > 
> > For example, remove_inode_hugepages() is safe to drop uffd-wp ptes, because it
> > has taken hugetlb fault mutex so that no concurrent page fault would trigger.
> > While the call to hugetlb_vmdelete_list() in hugetlbfs_punch_hole() is not
> > safe.  That's why the previous call will be with ZAP_FLAG_DROP_FILE_UFFD_WP,
> > while the latter one won't be able to.
> 
> This commit message is a bit confusing, but the hugetlb hole punch code
> path is a bit confusing. :)   How about something like this?
> 
> As with  shmem uffd-wp special ptes, only drop the uffd-wp special swap
> pte if unmapping an entire vma or synchronized such that faults can not
> race with the unmap operation.  This requires passing zap_flags all the
> way to the lowest level hugetlb unmap routine: __unmap_hugepage_range.
> 
> In general, unmap calls originated in hugetlbfs code will pass the
> ZAP_FLAG_DROP_FILE_UFFD_WP flag as synchronization is in place to prevent
> faults.  The exception is hole punch which will first unmap without any
> synchronization.  Later when hole punch actually removes the page from
> the file, it will check to see if there was a subsequent fault and if
> so take the hugetlb fault mutex while unmapping again.  This second
> unmap will pass in ZAP_FLAG_DROP_FILE_UFFD_WP.

Sure, I can replace mine.

Maybe it's because I didn't explain enough on the reasoning so it's confusing.
The core justification of "whether to apply the ZAP_FLAG_DROP_FILE_UFFD_WP flag
when unmapping a hugetlb range" is (IMHO): we should never reach a state where a
page fault could erroneously fault in a page-cache page that was wr-protected
to be writable, even in an extremely short period.  That could happen if
e.g. we pass ZAP_FLAG_DROP_FILE_UFFD_WP in hugetlbfs_punch_hole() when calling
hugetlb_vmdelete_list(), because if a page fault triggers after that call and
before the remove_inode_hugepages() right after it, the page cache can be
mapped writable again in the small window, which can cause data corruption.

> 
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  fs/hugetlbfs/inode.c    | 15 +++++++++------
> >  include/linux/hugetlb.h | 13 ++++++++-----
> >  mm/hugetlb.c            | 27 +++++++++++++++++++++------
> >  mm/memory.c             |  5 ++++-
> >  4 files changed, 42 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> > index d81f52b87bd7..5fe19e801a2b 100644
> > --- a/fs/hugetlbfs/inode.c
> > +++ b/fs/hugetlbfs/inode.c
> > @@ -399,7 +399,8 @@ static void remove_huge_page(struct page *page)
> >  }
> >  
> >  static void
> > -hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
> > +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
> > +		      unsigned long zap_flags)
> >  {
> >  	struct vm_area_struct *vma;
> >  
> > @@ -432,7 +433,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
> >  		}
> >  
> >  		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
> > -									NULL);
> > +				     NULL, zap_flags);
> >  	}
> >  }
> >  
> > @@ -513,7 +514,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> >  				mutex_lock(&hugetlb_fault_mutex_table[hash]);
> >  				hugetlb_vmdelete_list(&mapping->i_mmap,
> >  					index * pages_per_huge_page(h),
> > -					(index + 1) * pages_per_huge_page(h));
> > +					(index + 1) * pages_per_huge_page(h),
> > +					ZAP_FLAG_DROP_FILE_UFFD_WP);
> >  				i_mmap_unlock_write(mapping);
> >  			}
> >  
> > @@ -579,7 +581,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
> >  	i_mmap_lock_write(mapping);
> >  	i_size_write(inode, offset);
> >  	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
> > -		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
> > +		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
> > +				      ZAP_FLAG_DROP_FILE_UFFD_WP);
> >  	i_mmap_unlock_write(mapping);
> >  	remove_inode_hugepages(inode, offset, LLONG_MAX);
> >  }
> > @@ -612,8 +615,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> >  		i_mmap_lock_write(mapping);
> >  		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
> >  			hugetlb_vmdelete_list(&mapping->i_mmap,
> > -						hole_start >> PAGE_SHIFT,
> > -						hole_end  >> PAGE_SHIFT);
> > +					      hole_start >> PAGE_SHIFT,
> > +					      hole_end >> PAGE_SHIFT, 0);
> >  		i_mmap_unlock_write(mapping);
> >  		remove_inode_hugepages(inode, hole_start, hole_end);
> >  		inode_unlock(inode);
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 92710600596e..4047fa042782 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -121,14 +121,15 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
> >  			 unsigned long *, unsigned long *, long, unsigned int,
> >  			 int *);
> >  void unmap_hugepage_range(struct vm_area_struct *,
> > -			  unsigned long, unsigned long, struct page *);
> > +			  unsigned long, unsigned long, struct page *,
> > +			  unsigned long);
> >  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> >  			  struct vm_area_struct *vma,
> >  			  unsigned long start, unsigned long end,
> > -			  struct page *ref_page);
> > +			  struct page *ref_page, unsigned long zap_flags);
> >  void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  				unsigned long start, unsigned long end,
> > -				struct page *ref_page);
> > +				struct page *ref_page, unsigned long zap_flags);
> 
> Nothing introduced with your patch, but it seems __unmap_hugepage_range_final
> does not need to be in the header and can be static in hugetlb.c.

It seems to be used in unmap_single_vma() of mm/memory.c?

> 
> Everything else looks good,
> 
> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>

Please let me know whether you want my extra paragraph in the commit message,
then I'll coordinate accordingly with the R-b.  Thanks!
Mike Kravetz April 26, 2021, 9:36 p.m. UTC | #3
On 4/26/21 2:16 PM, Peter Xu wrote:
> On Fri, Apr 23, 2021 at 01:33:08PM -0700, Mike Kravetz wrote:
>> On 3/22/21 5:50 PM, Peter Xu wrote:
>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>> index 92710600596e..4047fa042782 100644
>>> --- a/include/linux/hugetlb.h
>>> +++ b/include/linux/hugetlb.h
>>> @@ -121,14 +121,15 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>>>  			 unsigned long *, unsigned long *, long, unsigned int,
>>>  			 int *);
>>>  void unmap_hugepage_range(struct vm_area_struct *,
>>> -			  unsigned long, unsigned long, struct page *);
>>> +			  unsigned long, unsigned long, struct page *,
>>> +			  unsigned long);
>>>  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
>>>  			  struct vm_area_struct *vma,
>>>  			  unsigned long start, unsigned long end,
>>> -			  struct page *ref_page);
>>> +			  struct page *ref_page, unsigned long zap_flags);
>>>  void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>>  				unsigned long start, unsigned long end,
>>> -				struct page *ref_page);
>>> +				struct page *ref_page, unsigned long zap_flags);
>>
>> Nothing introduced with your patch, but it seems __unmap_hugepage_range_final
>> does not need to be in the header and can be static in hugetlb.c.
> 
> It seems to be used in unmap_single_vma() of mm/memory.c?

Sorry, that should have been __unmap_hugepage_range.  No need for you to
address in this series.

>>
>> Everything else looks good,
>>
>> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
> 
> Please let me know whether you want my extra paragraph in the commit message,
> then I'll coordinate accordingly with the R-b.  Thanks!

Yes, please do add the extra paragraph.
Peter Xu April 26, 2021, 10:05 p.m. UTC | #4
On Mon, Apr 26, 2021 at 02:36:26PM -0700, Mike Kravetz wrote:
> On 4/26/21 2:16 PM, Peter Xu wrote:
> > On Fri, Apr 23, 2021 at 01:33:08PM -0700, Mike Kravetz wrote:
> >> On 3/22/21 5:50 PM, Peter Xu wrote:
> >>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> >>> index 92710600596e..4047fa042782 100644
> >>> --- a/include/linux/hugetlb.h
> >>> +++ b/include/linux/hugetlb.h
> >>> @@ -121,14 +121,15 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
> >>>  			 unsigned long *, unsigned long *, long, unsigned int,
> >>>  			 int *);
> >>>  void unmap_hugepage_range(struct vm_area_struct *,
> >>> -			  unsigned long, unsigned long, struct page *);
> >>> +			  unsigned long, unsigned long, struct page *,
> >>> +			  unsigned long);
> >>>  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> >>>  			  struct vm_area_struct *vma,
> >>>  			  unsigned long start, unsigned long end,
> >>> -			  struct page *ref_page);
> >>> +			  struct page *ref_page, unsigned long zap_flags);
> >>>  void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >>>  				unsigned long start, unsigned long end,
> >>> -				struct page *ref_page);
> >>> +				struct page *ref_page, unsigned long zap_flags);
> >>
> >> Nothing introduced with your patch, but it seems __unmap_hugepage_range_final
> >> does not need to be in the header and can be static in hugetlb.c.
> > 
> > It seems to be used in unmap_single_vma() of mm/memory.c?
> 
> Sorry, that should have been __unmap_hugepage_range.  No need for you to
> address in this series.

Ah yes; I'd rather add a patch if you won't object, since my series will touch
that definition.

> 
> >>
> >> Everything else looks good,
> >>
> >> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
> > 
> > Please let me know whether you want my extra paragraph in the commit message,
> > then I'll coordinate accordingly with the R-b.  Thanks!
> 
> Yes, please do add the extra paragraph.  

Will do.
Mike Kravetz April 26, 2021, 11:09 p.m. UTC | #5
On 4/26/21 3:05 PM, Peter Xu wrote:
> On Mon, Apr 26, 2021 at 02:36:26PM -0700, Mike Kravetz wrote:
>> On 4/26/21 2:16 PM, Peter Xu wrote:
>>> On Fri, Apr 23, 2021 at 01:33:08PM -0700, Mike Kravetz wrote:
>>>> On 3/22/21 5:50 PM, Peter Xu wrote:
>>>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>>>> index 92710600596e..4047fa042782 100644
>>>>> --- a/include/linux/hugetlb.h
>>>>> +++ b/include/linux/hugetlb.h
>>>>> @@ -121,14 +121,15 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>>>>>  			 unsigned long *, unsigned long *, long, unsigned int,
>>>>>  			 int *);
>>>>>  void unmap_hugepage_range(struct vm_area_struct *,
>>>>> -			  unsigned long, unsigned long, struct page *);
>>>>> +			  unsigned long, unsigned long, struct page *,
>>>>> +			  unsigned long);
>>>>>  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
>>>>>  			  struct vm_area_struct *vma,
>>>>>  			  unsigned long start, unsigned long end,
>>>>> -			  struct page *ref_page);
>>>>> +			  struct page *ref_page, unsigned long zap_flags);
>>>>>  void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>>>>  				unsigned long start, unsigned long end,
>>>>> -				struct page *ref_page);
>>>>> +				struct page *ref_page, unsigned long zap_flags);
>>>>
>>>> Nothing introduced with your patch, but it seems __unmap_hugepage_range_final
>>>> does not need to be in the header and can be static in hugetlb.c.
>>>
>>> It seems to be used in unmap_single_vma() of mm/memory.c?
>>
>> Sorry, that should have been __unmap_hugepage_range.  No need for you to
>> address in this series.
> 
> Ah yes; I'd rather add a patch if you won't object, since my series will touch
> that definition.
> 

No objection if you want to fix, thanks.
diff mbox series

Patch

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d81f52b87bd7..5fe19e801a2b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -399,7 +399,8 @@  static void remove_huge_page(struct page *page)
 }
 
 static void
-hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
+		      unsigned long zap_flags)
 {
 	struct vm_area_struct *vma;
 
@@ -432,7 +433,7 @@  hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
 		}
 
 		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
-									NULL);
+				     NULL, zap_flags);
 	}
 }
 
@@ -513,7 +514,8 @@  static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 				mutex_lock(&hugetlb_fault_mutex_table[hash]);
 				hugetlb_vmdelete_list(&mapping->i_mmap,
 					index * pages_per_huge_page(h),
-					(index + 1) * pages_per_huge_page(h));
+					(index + 1) * pages_per_huge_page(h),
+					ZAP_FLAG_DROP_FILE_UFFD_WP);
 				i_mmap_unlock_write(mapping);
 			}
 
@@ -579,7 +581,8 @@  static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	i_mmap_lock_write(mapping);
 	i_size_write(inode, offset);
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
-		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+				      ZAP_FLAG_DROP_FILE_UFFD_WP);
 	i_mmap_unlock_write(mapping);
 	remove_inode_hugepages(inode, offset, LLONG_MAX);
 }
@@ -612,8 +615,8 @@  static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		i_mmap_lock_write(mapping);
 		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
 			hugetlb_vmdelete_list(&mapping->i_mmap,
-						hole_start >> PAGE_SHIFT,
-						hole_end  >> PAGE_SHIFT);
+					      hole_start >> PAGE_SHIFT,
+					      hole_end >> PAGE_SHIFT, 0);
 		i_mmap_unlock_write(mapping);
 		remove_inode_hugepages(inode, hole_start, hole_end);
 		inode_unlock(inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 92710600596e..4047fa042782 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -121,14 +121,15 @@  long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 unsigned long *, unsigned long *, long, unsigned int,
 			 int *);
 void unmap_hugepage_range(struct vm_area_struct *,
-			  unsigned long, unsigned long, struct page *);
+			  unsigned long, unsigned long, struct page *,
+			  unsigned long);
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end,
-			  struct page *ref_page);
+			  struct page *ref_page, unsigned long zap_flags);
 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 				unsigned long start, unsigned long end,
-				struct page *ref_page);
+				struct page *ref_page, unsigned long zap_flags);
 void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(char *buf, int len, int nid);
 void hugetlb_show_meminfo(void);
@@ -361,14 +362,16 @@  static inline unsigned long hugetlb_change_protection(
 
 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			struct vm_area_struct *vma, unsigned long start,
-			unsigned long end, struct page *ref_page)
+			unsigned long end, struct page *ref_page,
+			unsigned long zap_flags)
 {
 	BUG();
 }
 
 static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 			struct vm_area_struct *vma, unsigned long start,
-			unsigned long end, struct page *ref_page)
+			unsigned long end, struct page *ref_page,
+			unsigned long zap_flags)
 {
 	BUG();
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d4acf9d9d087..deeae6d40dad 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3936,7 +3936,7 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			    unsigned long start, unsigned long end,
-			    struct page *ref_page)
+			    struct page *ref_page, unsigned long zap_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -3988,6 +3988,19 @@  void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			continue;
 		}
 
+		if (unlikely(is_swap_special_pte(pte))) {
+			WARN_ON_ONCE(!pte_swp_uffd_wp_special(pte));
+			/*
+			 * Only drop the special swap uffd-wp pte if
+			 * e.g. unmapping a vma or punching a hole (with proper
+			 * lock held so that concurrent page fault won't happen).
+			 */
+			if (zap_flags & ZAP_FLAG_DROP_FILE_UFFD_WP)
+				huge_pte_clear(mm, address, ptep, sz);
+			spin_unlock(ptl);
+			continue;
+		}
+
 		/*
 		 * Migrating hugepage or HWPoisoned hugepage is already
 		 * unmapped and its refcount is dropped, so just clear pte here.
@@ -4039,9 +4052,10 @@  void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end, struct page *ref_page)
+			  unsigned long end, struct page *ref_page,
+			  unsigned long zap_flags)
 {
-	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
+	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
 	/*
 	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
@@ -4057,12 +4071,13 @@  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end, struct page *ref_page)
+			  unsigned long end, struct page *ref_page,
+			  unsigned long zap_flags)
 {
 	struct mmu_gather tlb;
 
 	tlb_gather_mmu(&tlb, vma->vm_mm);
-	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
 	tlb_finish_mmu(&tlb);
 }
 
@@ -4117,7 +4132,7 @@  static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 			unmap_hugepage_range(iter_vma, address,
-					     address + huge_page_size(h), page);
+					     address + huge_page_size(h), page, 0);
 	}
 	i_mmap_unlock_write(mapping);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 766946d3eab0..4bf7f8e83733 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1515,8 +1515,11 @@  static void unmap_single_vma(struct mmu_gather *tlb,
 			 * safe to do nothing in this case.
 			 */
 			if (vma->vm_file) {
+				unsigned long zap_flags = details ?
+				    details->zap_flags : 0;
 				i_mmap_lock_write(vma->vm_file->f_mapping);
-				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+				__unmap_hugepage_range_final(tlb, vma, start, end,
+							     NULL, zap_flags);
 				i_mmap_unlock_write(vma->vm_file->f_mapping);
 			}
 		} else