
[v2,4/4] hugetlb: Do early cow when page pinned on src mm

Message ID 20210204145033.136755-5-peterx@redhat.com (mailing list archive)
State New, archived
Series mm/hugetlb: Early cow on fork, and a few cleanups

Commit Message

Peter Xu Feb. 4, 2021, 2:50 p.m. UTC
This is the last missing piece of the COW-during-fork effort for the case where
pinned pages are found.  See 70e806e4e645 ("mm: Do early cow for pinned pages
during fork() for ptes", 2020-09-27) for more information; we do a similar
thing here, only for hugetlb pages instead of ptes.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 mm/hugetlb.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 5 deletions(-)

Comments

Mike Kravetz Feb. 4, 2021, 11:25 p.m. UTC | #1
On 2/4/21 6:50 AM, Peter Xu wrote:
> This is the last missing piece of the COW-during-fork effort when there're
> pinned pages found.  One can reference 70e806e4e645 ("mm: Do early cow for
> pinned pages during fork() for ptes", 2020-09-27) for more information, since
> we do similar things here rather than pte this time, but just for hugetlb.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  mm/hugetlb.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 56 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 9e6ea96bf33b..5793936e00ef 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3734,11 +3734,27 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
>  		return false;
>  }
>  
> +static void
> +hugetlb_copy_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
> +		  struct page *old_page, struct page *new_page)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	unsigned int psize = pages_per_huge_page(h);
> +
> +	copy_user_huge_page(new_page, old_page, addr, vma, psize);

copy_user_huge_page calls cond_resched() and has might_sleep().  Imagine
the time it takes to copy 1G.  Usually called without holding locks, but
this new code is calling it with ptl locks held.  The copy should be done
outside the ptl, but you will need the ptl to update the pte/rmap.  So,
doing all this within one neat helper like this may not be possible.
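
As a rough sketch only (structure is illustrative, not the posted code), the
long copy could be done with both ptls dropped and the source pte re-validated
before installing the new page, assuming "entry" still holds the earlier
huge_ptep_get() of the src pte:

	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);

	/* Long-running copy with no ptl held; may cond_resched() */
	copy_user_huge_page(new_page, old_page, addr, vma, pages_per_huge_page(h));
	__SetPageUptodate(new_page);

	dst_ptl = huge_pte_lock(h, dst, dst_pte);
	src_ptl = huge_pte_lockptr(h, src, src_pte);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	if (!pte_same(huge_ptep_get(src_pte), entry)) {
		/* Raced: src pte changed while unlocked; drop new_page and retry */
		put_page(new_page);
	} else {
		/* Install new_page here: set pte, add rmap, bump counters */
	}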

> +	__SetPageUptodate(new_page);
> +	ClearPagePrivate(new_page);
> +	set_page_huge_active(new_page);

Code to replace the above ClearPagePrivate and set_page_huge_active is
in Andrew's tree.  With changes in Andrew's tree, this would be:

	ClearHPageRestoreReserve(new_page);
	SetHPageMigratable(new_page);

Ideally, the SetHPageMigratable would be done after the set_pte and add_rmap
so the page does not get migrated before these operations.  However, this
can not happen since we are holding the ptl.  So, no problem here.  If code
is restructured to call copy_user_huge_page outside ptl, keep this in mind.

Also, technically ClearHPageRestoreReserve is not needed as it would not be
set by alloc_huge_page because we did not consume a reserve.  However, better
to leave in place in case someone wants to use helper for something else.
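
Putting the two points above together, the install side could be kept in a
small helper that only runs under the ptl, roughly like this (hypothetical
name, sketch only, with SetHPageMigratable done last as noted above):

	static void hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep,
					 unsigned long addr, struct page *new_page)
	{
		__SetPageUptodate(new_page);
		ClearHPageRestoreReserve(new_page);
		set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
		hugepage_add_new_anon_rmap(new_page, vma, addr);
		hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
		/* Mark migratable only after the pte and rmap are in place */
		SetHPageMigratable(new_page);
	}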

> +	set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
> +	hugepage_add_new_anon_rmap(new_page, vma, addr);
> +	hugetlb_count_add(psize, vma->vm_mm);
> +}
> +
>  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>  			    struct vm_area_struct *vma)
>  {
>  	pte_t *src_pte, *dst_pte, entry, dst_entry;
> -	struct page *ptepage;
> +	struct page *ptepage, *prealloc = NULL;
>  	unsigned long addr;
>  	int cow;
>  	struct hstate *h = hstate_vma(vma);
> @@ -3787,7 +3803,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>  		dst_entry = huge_ptep_get(dst_pte);
>  		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
>  			continue;
> -
> +again:
>  		dst_ptl = huge_pte_lock(h, dst, dst_pte);
>  		src_ptl = huge_pte_lockptr(h, src, src_pte);
>  		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
> @@ -3816,6 +3832,39 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>  			}
>  			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
>  		} else {
> +			entry = huge_ptep_get(src_pte);
> +			ptepage = pte_page(entry);
> +			get_page(ptepage);
> +
> +			/*
> +			 * This is a rare case where we see pinned hugetlb
> +			 * pages while they're prone to COW.  We need to do the
> +			 * COW earlier during fork.
> +			 *
> +			 * When pre-allocating the page we need to be without
> +			 * all the locks since we could sleep when allocate.
> +			 */
> +			if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
> +				if (!prealloc) {
> +					put_page(ptepage);
> +					spin_unlock(src_ptl);
> +					spin_unlock(dst_ptl);
> +					prealloc = alloc_huge_page(vma, addr, 1);
> +					if (!prealloc) {

alloc_huge_page will return error codes, so you need to check IS_ERR(prealloc)
not just NULL.
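
One way that could look (sketch only), also making sure the final
put_page(prealloc) at the end of the function never sees an ERR_PTR:

	prealloc = alloc_huge_page(vma, addr, 1);
	if (IS_ERR(prealloc)) {
		ret = PTR_ERR(prealloc);
		prealloc = NULL;	/* don't put_page() an ERR_PTR later */
		break;
	}
	goto again;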
Peter Xu Feb. 5, 2021, 1:43 a.m. UTC | #2
On Thu, Feb 04, 2021 at 03:25:37PM -0800, Mike Kravetz wrote:
> On 2/4/21 6:50 AM, Peter Xu wrote:
> > This is the last missing piece of the COW-during-fork effort when there're
> > pinned pages found.  One can reference 70e806e4e645 ("mm: Do early cow for
> > pinned pages during fork() for ptes", 2020-09-27) for more information, since
> > we do similar things here rather than pte this time, but just for hugetlb.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  mm/hugetlb.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 56 insertions(+), 5 deletions(-)
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 9e6ea96bf33b..5793936e00ef 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -3734,11 +3734,27 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
> >  		return false;
> >  }
> >  
> > +static void
> > +hugetlb_copy_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
> > +		  struct page *old_page, struct page *new_page)
> > +{
> > +	struct hstate *h = hstate_vma(vma);
> > +	unsigned int psize = pages_per_huge_page(h);
> > +
> > +	copy_user_huge_page(new_page, old_page, addr, vma, psize);
> 
> copy_user_huge_page calls cond_resched() and has might_sleep().  Imagine
> the time it takes to copy 1G.  Usually called without holding locks, but
> this new code is calling it with ptl locks held.  The copy should be done
> outside the ptl, but you will need the ptl to update the pte/rmap.  So,
> doing all this within one neat helper like this may not be possible.

Right, I'll move the copy outside, thanks for spotting this.

> 
> > +	__SetPageUptodate(new_page);
> > +	ClearPagePrivate(new_page);
> > +	set_page_huge_active(new_page);
> 
> Code to replace the above ClearPagePrivate and set_page_huge_active is
> in Andrew's tree.  With changes in Andrew's tree, this would be:
> 
> 	ClearHPageRestoreReserve(new_page);
> 	SetHPageMigratable(new_page);

Indeed these names are much better than using the default ones.  In the
meantime I'll rebase to linux-next/akpm.  Sorry, it's never easy for me to
find the right branch...

> 
> Ideally, the SetHPageMigratable would be done after the set_pte and add_rmap
> so the page does not get migrated before these operations.  However, this
> can not happen since we are holding the ptl.  So, no problem here.  If code
> is restructured to call copy_user_huge_page outside ptl, keep this in mind.
> 
> Also, technically ClearHPageRestoreReserve is not needed as it would not be
> set by alloc_huge_page because we did not consume a reserve.  However, better
> to leave in place in case someone wants to use helper for something else.

OK, I'll keep it for clearness.

> 
> > +	set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
> > +	hugepage_add_new_anon_rmap(new_page, vma, addr);
> > +	hugetlb_count_add(psize, vma->vm_mm);
> > +}
> > +
> >  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> >  			    struct vm_area_struct *vma)
> >  {
> >  	pte_t *src_pte, *dst_pte, entry, dst_entry;
> > -	struct page *ptepage;
> > +	struct page *ptepage, *prealloc = NULL;
> >  	unsigned long addr;
> >  	int cow;
> >  	struct hstate *h = hstate_vma(vma);
> > @@ -3787,7 +3803,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> >  		dst_entry = huge_ptep_get(dst_pte);
> >  		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
> >  			continue;
> > -
> > +again:
> >  		dst_ptl = huge_pte_lock(h, dst, dst_pte);
> >  		src_ptl = huge_pte_lockptr(h, src, src_pte);
> >  		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

Side question: Mike, do you know why we need this lock_nested()?  Could the src
lock already be taken for some reason?  It confused me when I read this chunk.

> > @@ -3816,6 +3832,39 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> >  			}
> >  			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
> >  		} else {
> > +			entry = huge_ptep_get(src_pte);
> > +			ptepage = pte_page(entry);
> > +			get_page(ptepage);
> > +
> > +			/*
> > +			 * This is a rare case where we see pinned hugetlb
> > +			 * pages while they're prone to COW.  We need to do the
> > +			 * COW earlier during fork.
> > +			 *
> > +			 * When pre-allocating the page we need to be without
> > +			 * all the locks since we could sleep when allocate.
> > +			 */
> > +			if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
> > +				if (!prealloc) {
> > +					put_page(ptepage);
> > +					spin_unlock(src_ptl);
> > +					spin_unlock(dst_ptl);
> > +					prealloc = alloc_huge_page(vma, addr, 1);
> > +					if (!prealloc) {
> 
> alloc_huge_page will return error codes, so you need to check IS_ERR(prealloc)
> not just NULL.

Definitely, I'll fix.

Thanks,
Mike Kravetz Feb. 5, 2021, 5:11 a.m. UTC | #3
On 2/4/21 5:43 PM, Peter Xu wrote:
> On Thu, Feb 04, 2021 at 03:25:37PM -0800, Mike Kravetz wrote:
>> On 2/4/21 6:50 AM, Peter Xu wrote:
>>> This is the last missing piece of the COW-during-fork effort when there're
>>> pinned pages found.  One can reference 70e806e4e645 ("mm: Do early cow for
>>> pinned pages during fork() for ptes", 2020-09-27) for more information, since
>>> we do similar things here rather than pte this time, but just for hugetlb.
>>>
>>> Signed-off-by: Peter Xu <peterx@redhat.com>
>>> ---
>>>  mm/hugetlb.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-----
>>>  1 file changed, 56 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 9e6ea96bf33b..5793936e00ef 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> +	__SetPageUptodate(new_page);
>>> +	ClearPagePrivate(new_page);
>>> +	set_page_huge_active(new_page);
>>
>> Code to replace the above ClearPagePrivate and set_page_huge_active is
>> in Andrew's tree.  With changes in Andrew's tree, this would be:
>>
>> 	ClearHPageRestoreReserve(new_page);
>> 	SetHPageMigratable(new_page);
> 
> Indeed these names are much better than using the default ones.  At the
> meantime I'll rebase to linux-next/akpm.  Sorry it's always not easy for me to
> find the right branch...

No worries.  I only know because I recently changed these.

...
>>> @@ -3787,7 +3803,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>>>  		dst_entry = huge_ptep_get(dst_pte);
>>>  		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
>>>  			continue;
>>> -
>>> +again:
>>>  		dst_ptl = huge_pte_lock(h, dst, dst_pte);
>>>  		src_ptl = huge_pte_lockptr(h, src, src_pte);
>>>  		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
> 
> Side question: Mike, do you know why we need this lock_nested()?  Could the src
> lock be taken due to any reason already?  It confused me when I read the chunk.

I see that it was added with commit 4647875819aa.  That was when huge pages
used the single per-mm ptl.  Lockdep seemed to complain about taking
&mm->page_table_lock twice.   Certainly, source and destination mm can not
be the same.  Right?  I do not have the full history, but it 'looks' like
lockdep might have been confused and this was added to keep it quiet.

BTW - Copy page range for 'normal' pages has the same spin_lock_nested().
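
For what it's worth, the annotation only tells lockdep that taking two locks of
the same lock class here is intentional; these are the same lines as in the
patch, with an explanatory comment added:

	/*
	 * Both page-table locks belong to the same lockdep class, so the
	 * second acquisition would look like recursive locking to lockdep
	 * unless it is annotated as one level of intended nesting.
	 */
	dst_ptl = huge_pte_lock(h, dst, dst_pte);
	src_ptl = huge_pte_lockptr(h, src, src_pte);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);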
Zhang, Wei Feb. 5, 2021, 2:58 p.m. UTC | #4
Hi Peter,

Gal and I worked together. We tested the patch v2 and can confirm it is working as intended.

Thank you very much for your quick response!

Sincerely,

Wei Zhang


On 2/4/21, 6:51 AM, "Peter Xu" <peterx@redhat.com> wrote:

    This is the last missing piece of the COW-during-fork effort when there're
    pinned pages found.  One can reference 70e806e4e645 ("mm: Do early cow for
    pinned pages during fork() for ptes", 2020-09-27) for more information, since
    we do similar things here rather than pte this time, but just for hugetlb.

    Signed-off-by: Peter Xu <peterx@redhat.com>
    ---
     mm/hugetlb.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-----
     1 file changed, 56 insertions(+), 5 deletions(-)

    diff --git a/mm/hugetlb.c b/mm/hugetlb.c
    index 9e6ea96bf33b..5793936e00ef 100644
    --- a/mm/hugetlb.c
    +++ b/mm/hugetlb.c
    @@ -3734,11 +3734,27 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
                    return false;
     }

    +static void
    +hugetlb_copy_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
    +                 struct page *old_page, struct page *new_page)
    +{
    +       struct hstate *h = hstate_vma(vma);
    +       unsigned int psize = pages_per_huge_page(h);
    +
    +       copy_user_huge_page(new_page, old_page, addr, vma, psize);
    +       __SetPageUptodate(new_page);
    +       ClearPagePrivate(new_page);
    +       set_page_huge_active(new_page);
    +       set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
    +       hugepage_add_new_anon_rmap(new_page, vma, addr);
    +       hugetlb_count_add(psize, vma->vm_mm);
    +}
    +
     int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                                struct vm_area_struct *vma)
     {
            pte_t *src_pte, *dst_pte, entry, dst_entry;
    -       struct page *ptepage;
    +       struct page *ptepage, *prealloc = NULL;
            unsigned long addr;
            int cow;
            struct hstate *h = hstate_vma(vma);
    @@ -3787,7 +3803,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                    dst_entry = huge_ptep_get(dst_pte);
                    if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
                            continue;
    -
    +again:
                    dst_ptl = huge_pte_lock(h, dst, dst_pte);
                    src_ptl = huge_pte_lockptr(h, src, src_pte);
                    spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
    @@ -3816,6 +3832,39 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            }
                            set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
                    } else {
    +                       entry = huge_ptep_get(src_pte);
    +                       ptepage = pte_page(entry);
    +                       get_page(ptepage);
    +
    +                       /*
    +                        * This is a rare case where we see pinned hugetlb
    +                        * pages while they're prone to COW.  We need to do the
    +                        * COW earlier during fork.
    +                        *
    +                        * When pre-allocating the page we need to be without
    +                        * all the locks since we could sleep when allocate.
    +                        */
    +                       if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
    +                               if (!prealloc) {
    +                                       put_page(ptepage);
    +                                       spin_unlock(src_ptl);
    +                                       spin_unlock(dst_ptl);
    +                                       prealloc = alloc_huge_page(vma, addr, 1);
    +                                       if (!prealloc) {
    +                                               ret = -ENOMEM;
    +                                               break;
    +                                       }
    +                                       goto again;
    +                               }
    +                               hugetlb_copy_page(vma, dst_pte, addr, ptepage,
    +                                                 prealloc);
    +                               put_page(ptepage);
    +                               spin_unlock(src_ptl);
    +                               spin_unlock(dst_ptl);
    +                               prealloc = NULL;
    +                               continue;
    +                       }
    +
                            if (cow) {
                                    /*
                                     * No need to notify as we are downgrading page
    @@ -3826,9 +3875,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                                     */
                                    huge_ptep_set_wrprotect(src, addr, src_pte);
                            }
    -                       entry = huge_ptep_get(src_pte);
    -                       ptepage = pte_page(entry);
    -                       get_page(ptepage);
    +
                            page_dup_rmap(ptepage, true);
                            set_huge_pte_at(dst, addr, dst_pte, entry);
                            hugetlb_count_add(pages_per_huge_page(h), dst);
    @@ -3842,6 +3889,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
            else
                    i_mmap_unlock_read(mapping);

    +       /* Free the preallocated page if not used at last */
    +       if (prealloc)
    +               put_page(prealloc);
    +
            return ret;
     }

    --
    2.26.2
Peter Xu Feb. 5, 2021, 3:51 p.m. UTC | #5
On Fri, Feb 05, 2021 at 02:58:33PM +0000, Zhang, Wei wrote:
> Hi Peter,

Hi, Wei,

> 
> Gal and I worked together. We tested the patch v2 and can confirm it is working as intended.
> 
> Thank you very much for your quick response!

My thanks too for the quick testing!  Sorry that I'll need to post a v3; please
feel free to hold off on testing until I get some r-bs on the new version.

Thanks,
Peter Xu Feb. 5, 2021, 4:05 p.m. UTC | #6
On Thu, Feb 04, 2021 at 09:11:24PM -0800, Mike Kravetz wrote:

[...]

> >>> @@ -3787,7 +3803,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> >>>  		dst_entry = huge_ptep_get(dst_pte);
> >>>  		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
> >>>  			continue;
> >>> -
> >>> +again:
> >>>  		dst_ptl = huge_pte_lock(h, dst, dst_pte);
> >>>  		src_ptl = huge_pte_lockptr(h, src, src_pte);
> >>>  		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
> > 
> > Side question: Mike, do you know why we need this lock_nested()?  Could the src
> > lock be taken due to any reason already?  It confused me when I read the chunk.
> 
> I see that it was added with commit 4647875819aa.  That was when huge pages
> used the single per-mm ptl.  Lockdep seemed to complain about taking
> &mm->page_table_lock twice.   Certainly, source and destination mm can not
> be the same.  Right?

Right, at least that's my understanding..

> I do not have the full history, but it 'looks' like
> lockdep might have been confused and this was added to keep it quiet.
> 
> BTW - Copy page range for 'normal' pages has the same spin_lock_nested().

Yes.  I'll need to take the same lock in v3, so I think I'll just follow.

Thanks,
Gal Pressman Feb. 7, 2021, 9:09 a.m. UTC | #7
On 05/02/2021 17:51, Peter Xu wrote:
> On Fri, Feb 05, 2021 at 02:58:33PM +0000, Zhang, Wei wrote:
>> Hi Peter,
> 
> Hi, Wei,
> 
>>
>> Gal and I worked together. We tested the patch v2 and can confirm it is working as intended.
>>
>> Thank you very much for your quick response!
> 
> My thanks too on the quick testing!  Sorry that I'll need to post v3; please
> feel free to hold off the testing until I got some r-bs with the new version.

BTW, it might already be fixed in v3, but I encountered a compilation warning
with this series:

mm/memory.c: In function ‘copy_present_page’:
mm/memory.c:800:20: warning: unused variable ‘src_mm’ [-Wunused-variable]
  struct mm_struct *src_mm = src_vma->vm_mm;
                    ^~~~~~
Peter Xu Feb. 7, 2021, 3:31 p.m. UTC | #8
On Sun, Feb 07, 2021 at 11:09:29AM +0200, Gal Pressman wrote:
> On 05/02/2021 17:51, Peter Xu wrote:
> > On Fri, Feb 05, 2021 at 02:58:33PM +0000, Zhang, Wei wrote:
> >> Hi Peter,
> > 
> > Hi, Wei,
> > 
> >>
> >> Gal and I worked together. We tested the patch v2 and can confirm it is working as intended.
> >>
> >> Thank you very much for your quick response!
> > 
> > My thanks too on the quick testing!  Sorry that I'll need to post v3; please
> > feel free to hold off the testing until I got some r-bs with the new version.
> 
> BTW, it might already be fixed in v3, but I encountered a compilation warning
> with this series:
> 
> mm/memory.c: In function ‘copy_present_page’:
> mm/memory.c:800:20: warning: unused variable ‘src_mm’ [-Wunused-variable]
>   struct mm_struct *src_mm = src_vma->vm_mm;
>                     ^~~~~~

Not yet, thanks for raising this, Gal.  I'll wait for some more review feedback
on v3 and send a new version with it fixed.

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9e6ea96bf33b..5793936e00ef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3734,11 +3734,27 @@  static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
 		return false;
 }
 
+static void
+hugetlb_copy_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
+		  struct page *old_page, struct page *new_page)
+{
+	struct hstate *h = hstate_vma(vma);
+	unsigned int psize = pages_per_huge_page(h);
+
+	copy_user_huge_page(new_page, old_page, addr, vma, psize);
+	__SetPageUptodate(new_page);
+	ClearPagePrivate(new_page);
+	set_page_huge_active(new_page);
+	set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
+	hugepage_add_new_anon_rmap(new_page, vma, addr);
+	hugetlb_count_add(psize, vma->vm_mm);
+}
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
 {
 	pte_t *src_pte, *dst_pte, entry, dst_entry;
-	struct page *ptepage;
+	struct page *ptepage, *prealloc = NULL;
 	unsigned long addr;
 	int cow;
 	struct hstate *h = hstate_vma(vma);
@@ -3787,7 +3803,7 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		dst_entry = huge_ptep_get(dst_pte);
 		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
 			continue;
-
+again:
 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
 		src_ptl = huge_pte_lockptr(h, src, src_pte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -3816,6 +3832,39 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			}
 			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
 		} else {
+			entry = huge_ptep_get(src_pte);
+			ptepage = pte_page(entry);
+			get_page(ptepage);
+
+			/*
+			 * This is a rare case where we see pinned hugetlb
+			 * pages while they're prone to COW.  We need to do the
+			 * COW earlier during fork.
+			 *
+			 * When pre-allocating the page we need to be without
+			 * all the locks since we could sleep when allocate.
+			 */
+			if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+				if (!prealloc) {
+					put_page(ptepage);
+					spin_unlock(src_ptl);
+					spin_unlock(dst_ptl);
+					prealloc = alloc_huge_page(vma, addr, 1);
+					if (!prealloc) {
+						ret = -ENOMEM;
+						break;
+					}
+					goto again;
+				}
+				hugetlb_copy_page(vma, dst_pte, addr, ptepage,
+						  prealloc);
+				put_page(ptepage);
+				spin_unlock(src_ptl);
+				spin_unlock(dst_ptl);
+				prealloc = NULL;
+				continue;
+			}
+
 			if (cow) {
 				/*
 				 * No need to notify as we are downgrading page
@@ -3826,9 +3875,7 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				 */
 				huge_ptep_set_wrprotect(src, addr, src_pte);
 			}
-			entry = huge_ptep_get(src_pte);
-			ptepage = pte_page(entry);
-			get_page(ptepage);
+
 			page_dup_rmap(ptepage, true);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 			hugetlb_count_add(pages_per_huge_page(h), dst);
@@ -3842,6 +3889,10 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	else
 		i_mmap_unlock_read(mapping);
 
+	/* Free the preallocated page if not used at last */
+	if (prealloc)
+		put_page(prealloc);
+
 	return ret;
 }