[v4,1/5] mm: add mkwrite param to vm_insert_mixed()

Message ID 20170721223956.29485-2-ross.zwisler@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Ross Zwisler July 21, 2017, 10:39 p.m. UTC
To be able to use the common 4k zero page in DAX, we need to have our PTE
fault path look more like our PMD fault path, where a PTE entry can be
marked as dirty and writeable as it is first inserted, rather than waiting
for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call.

Right now we can rely on having a dax_pfn_mkwrite() call because we can
distinguish between these two cases in do_wp_page():

	case 1: 4k zero page => writeable DAX storage
	case 2: read-only DAX storage => writeable DAX storage

This distinction is made via vm_normal_page().  vm_normal_page() returns
NULL for the common 4k zero page, though, just as it does for DAX PTEs.
Instead of special casing the DAX + 4k zero page case, we will simplify our
DAX PTE page fault sequence so that it matches our DAX PMD sequence, and
get rid of the dax_pfn_mkwrite() helper.  We will instead use
dax_iomap_fault() to handle write-protection faults.

This means that insert_pfn() needs to follow the lead of insert_pfn_pmd()
and allow us to pass in a 'mkwrite' flag.  If 'mkwrite' is set, insert_pfn()
will do the work that was previously done by wp_page_reuse() as part of the
dax_pfn_mkwrite() call path.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 drivers/dax/device.c                    |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c |  3 ++-
 drivers/gpu/drm/gma500/framebuffer.c    |  2 +-
 drivers/gpu/drm/msm/msm_gem.c           |  3 ++-
 drivers/gpu/drm/omapdrm/omap_gem.c      |  6 ++++--
 drivers/gpu/drm/ttm/ttm_bo_vm.c         |  2 +-
 fs/dax.c                                |  2 +-
 include/linux/mm.h                      |  2 +-
 mm/memory.c                             | 27 +++++++++++++++++++++------
 9 files changed, 34 insertions(+), 15 deletions(-)
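
This patch itself passes 'false' at every existing call site; for context, a
hedged sketch of how a later patch in the series might use the new flag from
the DAX PTE fault path (hypothetical caller, assuming the v4 signature below;
not code from this patch):

	/* On a write fault, insert the PTE already dirty and writeable, as
	 * the PMD fault path does, so no follow-up pfn_mkwrite fault is
	 * needed. */
	if (vmf->flags & FAULT_FLAG_WRITE)
		return vm_insert_mixed(vma, vaddr, pfn, true);
	return vm_insert_mixed(vma, vaddr, pfn, false);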

Comments

Dan Williams July 22, 2017, 4:21 p.m. UTC | #1
On Fri, Jul 21, 2017 at 3:39 PM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> [...]
>
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index e9f3b3e..3973521 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -273,7 +273,7 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
>
>         pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
>
> -       rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
> +       rc = vm_insert_mixed(vmf->vma, vmf->address, pfn, false);

Ugh, I generally find bool flags unreadable. They place a tax on
jumping to function definition to recall what true and false mean. If
we want to go this 'add an argument' route can we at least add an enum
like:

enum {
    PTE_MKDIRTY,
    PTE_MKCLEAN,
};

...to differentiate the two cases?
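
A sketch of how a call site would then read (hypothetical: the enum is given
a name here and substituted for the bool in the v4 signature, neither of
which appears in the thread, and the enumerators are reordered so that
PTE_MKCLEAN == 0 maps to today's 'false'):

	/* Hypothetical naming of the suggested enum. */
	enum pte_dirty_op {
		PTE_MKCLEAN,	/* insert a clean, read-only entry */
		PTE_MKDIRTY,	/* insert a dirty, writeable entry */
	};

	/* The __dev_dax_pte_fault() call above would become: */
	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn, PTE_MKCLEAN);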
Jan Kara July 24, 2017, 11:15 a.m. UTC | #2
On Sat 22-07-17 09:21:31, Dan Williams wrote:
> On Fri, Jul 21, 2017 at 3:39 PM, Ross Zwisler
> <ross.zwisler@linux.intel.com> wrote:
> > [...]
> >
> > diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> > index e9f3b3e..3973521 100644
> > --- a/drivers/dax/device.c
> > +++ b/drivers/dax/device.c
> > @@ -273,7 +273,7 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
> >
> >         pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
> >
> > -       rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
> > +       rc = vm_insert_mixed(vmf->vma, vmf->address, pfn, false);
> 
> Ugh, I generally find bool flags unreadable. They place a tax on
> jumping to function definition to recall what true and false mean. If
> we want to go this 'add an argument' route can we at least add an enum
> like:
> 
> enum {
>     PTE_MKDIRTY,
>     PTE_MKCLEAN,
> };
> 
> ...to differentiate the two cases?

So how I usually deal with this is that I create e.g.:

__vm_insert_mixed() that takes the bool argument, make vm_insert_mixed()
pass false, and vm_insert_mixed_mkwrite() pass true. That way there's no
code duplication, old call sites can stay unchanged, and the naming
clearly says what's going on...
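
For instance, a minimal sketch of that scheme against the v4 code below
(illustrative; the names are the ones Jan suggests, not code from this
patch):

	static int __vm_insert_mixed(struct vm_area_struct *vma,
			unsigned long addr, pfn_t pfn, bool mkwrite)
	{
		/* body of today's vm_insert_mixed(), with 'mkwrite'
		 * threaded through to insert_pfn() */
	}

	int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn)
	{
		return __vm_insert_mixed(vma, addr, pfn, false);
	}
	EXPORT_SYMBOL(vm_insert_mixed);

	int vm_insert_mixed_mkwrite(struct vm_area_struct *vma,
			unsigned long addr, pfn_t pfn)
	{
		return __vm_insert_mixed(vma, addr, pfn, true);
	}
	EXPORT_SYMBOL(vm_insert_mixed_mkwrite);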

								Honza
Jan Kara July 24, 2017, 11:25 a.m. UTC | #3
> @@ -1658,14 +1658,28 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  	if (!pte)
>  		goto out;
>  	retval = -EBUSY;
> -	if (!pte_none(*pte))
> -		goto out_unlock;
> +	if (!pte_none(*pte)) {
> +		if (mkwrite) {
> +			if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))

Is the WARN_ON_ONCE() really appropriate here? Your test case with private
mappings has triggered this situation, if I'm right...

Otherwise the patch looks good to me.

								Honza
Ross Zwisler July 24, 2017, 3:13 p.m. UTC | #4
On Mon, Jul 24, 2017 at 01:15:31PM +0200, Jan Kara wrote:
> On Sat 22-07-17 09:21:31, Dan Williams wrote:
> > On Fri, Jul 21, 2017 at 3:39 PM, Ross Zwisler
> > <ross.zwisler@linux.intel.com> wrote:
> > > [...]
> > 
> > Ugh, I generally find bool flags unreadable. They place a tax on
> > jumping to function definition to recall what true and false mean. If
> > we want to go this 'add an argument' route can we at least add an enum
> > like:
> > 
> > enum {
> >     PTE_MKDIRTY,
> >     PTE_MKCLEAN,
> > };
> > 
> > ...to differentiate the two cases?
> 
> So how I usually deal with this is that I create e.g.:
> 
> __vm_insert_mixed() that takes the bool argument, make vm_insert_mixed()
> pass false, and vm_insert_mixed_mkwrite() pass true. That way there's no
> code duplication, old call sites can stay unchanged, and the naming
> clearly says what's going on...

Ah, that does seem cleaner.  I'll try that for v5.
Ross Zwisler July 24, 2017, 3:23 p.m. UTC | #5
On Mon, Jul 24, 2017 at 01:25:30PM +0200, Jan Kara wrote:
> > @@ -1658,14 +1658,28 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> >  	if (!pte)
> >  		goto out;
> >  	retval = -EBUSY;
> > -	if (!pte_none(*pte))
> > -		goto out_unlock;
> > +	if (!pte_none(*pte)) {
> > +		if (mkwrite) {
> > +			if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
> 
> Is the WARN_ON_ONCE() really appropriate here? Your test case with private
> mappings has triggered this situation, if I'm right...

Yep, I think this WARN_ON_ONCE() is correct.  The test with private mappings
had collisions between read-only DAX mappings which were being faulted in via
insert_pfn(), and read/write COW page cache mappings which were being faulted
in by wp_page_copy().

I was hitting a false-positive warning when I had the WARN_ON_ONCE() in
insert_pfn() outside of the mkwrite case, i.e.:

	if (!pte_none(*pte)) {
		if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
			goto out_unlock;
		if (mkwrite) {
			entry = *pte;
			goto out_mkwrite;
		} else
			goto out_unlock;
	}

This was triggering when one thread was faulting in a read-only DAX mapping
after another thread had already faulted in a read-write COW page cache page.

The patches I sent out have the warning in the mkwrite case, which would mean
that we were getting a fault for a read/write PTE in insert_pfn() and the PFN
didn't match what was already in the PTE.

This can't ever happen in the private mapping case because we will never
install a read/write PTE for normal storage, only for COW page cache pages.
Essentially I don't think we should ever be able to hit this warning, and if
we do I'd like to get the bug report so that I can track down how it was
happening and make sure that it's safe.  It is in the mkwrite path of
insert_pfn() which is currently only used by the DAX code.

Does that make sense to you, or would you recommend leaving it out?  (If so,
why?)
Jan Kara July 24, 2017, 3:59 p.m. UTC | #6
On Mon 24-07-17 09:23:57, Ross Zwisler wrote:
> On Mon, Jul 24, 2017 at 01:25:30PM +0200, Jan Kara wrote:
> > > [...]
> > > +			if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
> > 
> > Is the WARN_ON_ONCE() really appropriate here? Your test case with private
> > mappings has triggered this situation, if I'm right...
> 
> [...]
> 
> Does that make sense to you, or would you recommend leaving it out?  (If so,
> why?)

Ah, OK, makes sense. So feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

Patch

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index e9f3b3e..3973521 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -273,7 +273,7 @@  static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
+	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn, false);
 
 	if (rc == -ENOMEM)
 		return VM_FAULT_OOM;
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index c23479b..bfa6648 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -466,7 +466,8 @@  int exynos_drm_gem_fault(struct vm_fault *vmf)
 	}
 
 	pfn = page_to_pfn(exynos_gem->pages[page_offset]);
-	ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
+	ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV),
+			false);
 
 out:
 	switch (ret) {
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index 7da70b6..6dd865f 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -134,7 +134,7 @@  static int psbfb_vm_fault(struct vm_fault *vmf)
 		pfn = (phys_addr >> PAGE_SHIFT);
 
 		ret = vm_insert_mixed(vma, address,
-				__pfn_to_pfn_t(pfn, PFN_DEV));
+				__pfn_to_pfn_t(pfn, PFN_DEV), false);
 		if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0)))
 			break;
 		else if (unlikely(ret != 0)) {
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 65f3554..c187fd1 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -249,7 +249,8 @@  int msm_gem_fault(struct vm_fault *vmf)
 	VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
 			pfn, pfn << PAGE_SHIFT);
 
-	ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
+	ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV),
+			false);
 
 out_unlock:
 	mutex_unlock(&msm_obj->lock);
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 5c5c86d..26eebcd 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -393,7 +393,8 @@  static int fault_1d(struct drm_gem_object *obj,
 	VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
 			pfn, pfn << PAGE_SHIFT);
 
-	return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
+	return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV),
+			false);
 }
 
 /* Special handling for the case of faulting in 2d tiled buffers */
@@ -486,7 +487,8 @@  static int fault_2d(struct drm_gem_object *obj,
 			pfn, pfn << PAGE_SHIFT);
 
 	for (i = n; i > 0; i--) {
-		vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV));
+		vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV),
+				false);
 		pfn += priv->usergart[fmt].stride_pfn;
 		vaddr += PAGE_SIZE * m;
 	}
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index b442d12..e85bfa7 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -248,7 +248,7 @@  static int ttm_bo_vm_fault(struct vm_fault *vmf)
 
 		if (vma->vm_flags & VM_MIXEDMAP)
 			ret = vm_insert_mixed(&cvma, address,
-					__pfn_to_pfn_t(pfn, PFN_DEV));
+					__pfn_to_pfn_t(pfn, PFN_DEV), false);
 		else
 			ret = vm_insert_pfn(&cvma, address, pfn);
 
diff --git a/fs/dax.c b/fs/dax.c
index 306c2b6..c844a51 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -899,7 +899,7 @@  static int dax_insert_mapping(struct address_space *mapping,
 	*entryp = ret;
 
 	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	return vm_insert_mixed(vma, vaddr, pfn);
+	return vm_insert_mixed(vma, vaddr, pfn, false);
 }
 
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5..3eabc40 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2292,7 +2292,7 @@  int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn, pgprot_t pgprot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn);
+			pfn_t pfn, bool mkwrite);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
 
 
diff --git a/mm/memory.c b/mm/memory.c
index 0e517be..d351911 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1646,7 +1646,7 @@  int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 EXPORT_SYMBOL(vm_insert_page);
 
 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn, pgprot_t prot)
+			pfn_t pfn, pgprot_t prot, bool mkwrite)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int retval;
@@ -1658,14 +1658,28 @@  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	if (!pte)
 		goto out;
 	retval = -EBUSY;
-	if (!pte_none(*pte))
-		goto out_unlock;
+	if (!pte_none(*pte)) {
+		if (mkwrite) {
+			if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+				goto out_unlock;
+			entry = *pte;
+			goto out_mkwrite;
+		} else
+			goto out_unlock;
+	}
 
 	/* Ok, finally just insert the thing.. */
 	if (pfn_t_devmap(pfn))
 		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
 	else
 		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+	if (mkwrite) {
+		entry = pte_mkyoung(entry);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	}
+
 	set_pte_at(mm, addr, pte, entry);
 	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
@@ -1736,14 +1750,15 @@  int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
 	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
-	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+			false);
 
 	return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn_prot);
 
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn)
+			pfn_t pfn, bool mkwrite)
 {
 	pgprot_t pgprot = vma->vm_page_prot;
 
@@ -1772,7 +1787,7 @@  int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 		page = pfn_to_page(pfn_t_to_pfn(pfn));
 		return insert_page(vma, addr, page, pgprot);
 	}
-	return insert_pfn(vma, addr, pfn, pgprot);
+	return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
 }
 EXPORT_SYMBOL(vm_insert_mixed);