
[16/20] mm: Provide helper for finishing mkwrite faults

Message ID: 1474992504-20133-17-git-send-email-jack@suse.cz
State: Not Applicable

Commit Message

Jan Kara Sept. 27, 2016, 4:08 p.m. UTC
Provide a helper function for finishing write faults due to the PTE being
read-only. The helper will be used by DAX to avoid complicating generic MM
code with DAX locking specifics.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/mm.h |  1 +
 mm/memory.c        | 65 +++++++++++++++++++++++++++++++-----------------------
 2 files changed, 39 insertions(+), 27 deletions(-)
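
For context, the helper consolidates the lock-retake-and-revalidate sequence
that wp_pfn_shared() and wp_page_shared() previously open-coded (see the patch
at the bottom of this page). Below is a rough sketch of how an external
write-fault handler, such as the DAX path this series targets, might use it;
example_dax_mkwrite(), dax_lock_entry() and dax_unlock_entry() are
hypothetical stand-ins, not part of this patch:

/*
 * Illustrative sketch only: a write-fault handler outside mm/memory.c
 * prepares its backing store under its own synchronization, then lets
 * finish_mkwrite_fault() re-take the PTE lock, revalidate the PTE and
 * mark it writeable.
 */
static int example_dax_mkwrite(struct vm_fault *vmf)
{
	int ret;

	dax_lock_entry(vmf);			/* hypothetical entry locking */
	ret = finish_mkwrite_fault(vmf);	/* takes and drops the PTE lock */
	dax_unlock_entry(vmf);			/* hypothetical */

	/* In this version: VM_FAULT_WRITE on success, 0 if the PTE changed. */
	return ret;
}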

Comments

Ross Zwisler Oct. 18, 2016, 6:35 p.m. UTC | #1
On Tue, Sep 27, 2016 at 06:08:20PM +0200, Jan Kara wrote:
> Provide a helper function for finishing write faults due to the PTE being
> read-only. The helper will be used by DAX to avoid complicating generic MM
> code with DAX locking specifics.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  include/linux/mm.h |  1 +
>  mm/memory.c        | 65 +++++++++++++++++++++++++++++++-----------------------
>  2 files changed, 39 insertions(+), 27 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1055f2ece80d..e5a014be8932 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -617,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>  int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
>  		struct page *page);
>  int finish_fault(struct vm_fault *vmf);
> +int finish_mkwrite_fault(struct vm_fault *vmf);
>  #endif
>  
>  /*
> diff --git a/mm/memory.c b/mm/memory.c
> index f49e736d6a36..8c8cb7f2133e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2266,6 +2266,36 @@ oom:
>  	return VM_FAULT_OOM;
>  }
>  
> +/**
> + * finish_mkrite_fault - finish page fault making PTE writeable once the page
      finish_mkwrite_fault

> @@ -2315,26 +2335,17 @@ static int wp_page_shared(struct vm_fault *vmf)
>  			put_page(vmf->page);
>  			return tmp;
>  		}
> -		/*
> -		 * Since we dropped the lock we need to revalidate
> -		 * the PTE as someone else may have changed it.  If
> -		 * they did, we just return, as we can count on the
> -		 * MMU to tell us if they didn't also make it writable.
> -		 */
> -		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> -						vmf->address, &vmf->ptl);
> -		if (!pte_same(*vmf->pte, vmf->orig_pte)) {
> +		tmp = finish_mkwrite_fault(vmf);
> +		if (unlikely(!tmp || (tmp &
> +				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {

The 'tmp' return from finish_mkwrite_fault() can only be 0 or VM_FAULT_WRITE.
I think this test should just be 

		if (unlikely(!tmp)) {

With that and the small spelling fix:

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
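
For reference, the two values of 'tmp' that Ross describes come from the
helper's two exit paths, excerpted (with annotations) from the patch as
posted:

	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;		/* PTE changed under us: retry needed */
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;		/* PTE successfully made writeable */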
Jan Kara Oct. 19, 2016, 7:16 a.m. UTC | #2
On Tue 18-10-16 12:35:25, Ross Zwisler wrote:
> On Tue, Sep 27, 2016 at 06:08:20PM +0200, Jan Kara wrote:
> > Provide a helper function for finishing write faults due to the PTE being
> > read-only. The helper will be used by DAX to avoid complicating generic MM
> > code with DAX locking specifics.
> > 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  include/linux/mm.h |  1 +
> >  mm/memory.c        | 65 +++++++++++++++++++++++++++++++-----------------------
> >  2 files changed, 39 insertions(+), 27 deletions(-)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 1055f2ece80d..e5a014be8932 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -617,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> >  int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
> >  		struct page *page);
> >  int finish_fault(struct vm_fault *vmf);
> > +int finish_mkwrite_fault(struct vm_fault *vmf);
> >  #endif
> >  
> >  /*
> > diff --git a/mm/memory.c b/mm/memory.c
> > index f49e736d6a36..8c8cb7f2133e 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -2266,6 +2266,36 @@ oom:
> >  	return VM_FAULT_OOM;
> >  }
> >  
> > +/**
> > + * finish_mkrite_fault - finish page fault making PTE writeable once the page
>       finish_mkwrite_fault

Fixed, thanks.

> > @@ -2315,26 +2335,17 @@ static int wp_page_shared(struct vm_fault *vmf)
> >  			put_page(vmf->page);
> >  			return tmp;
> >  		}
> > -		/*
> > -		 * Since we dropped the lock we need to revalidate
> > -		 * the PTE as someone else may have changed it.  If
> > -		 * they did, we just return, as we can count on the
> > -		 * MMU to tell us if they didn't also make it writable.
> > -		 */
> > -		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > -						vmf->address, &vmf->ptl);
> > -		if (!pte_same(*vmf->pte, vmf->orig_pte)) {
> > +		tmp = finish_mkwrite_fault(vmf);
> > +		if (unlikely(!tmp || (tmp &
> > +				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> 
> The 'tmp' return from finish_mkwrite_fault() can only be 0 or VM_FAULT_WRITE.
> I think this test should just be 
> 
> 		if (unlikely(!tmp)) {

Right, finish_mkwrite_fault() currently cannot return anything other than
"retry needed", which is indicated by tmp == 0. However, I'd prefer to keep
symmetry with the finish_fault() handler, which can return other errors, and
to be prepared to handle them from finish_mkwrite_fault() as well.

								Honza
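
For reference, the check Jan is defending reads as follows in the patch
(annotations added); the !tmp predicate covers the current "retry needed"
return, while the mask covers finish_fault()-style returns that a future
finish_mkwrite_fault() could propagate:

	tmp = finish_mkwrite_fault(vmf);
	if (unlikely(!tmp ||		/* 0: PTE changed, retry the fault */
		     (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
		unlock_page(vmf->page);
		put_page(vmf->page);
		return tmp;		/* propagate error (or 0 to retry) */
	}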
Ross Zwisler Oct. 19, 2016, 5:21 p.m. UTC | #3
On Wed, Oct 19, 2016 at 09:16:00AM +0200, Jan Kara wrote:
> On Tue 18-10-16 12:35:25, Ross Zwisler wrote:
> > On Tue, Sep 27, 2016 at 06:08:20PM +0200, Jan Kara wrote:
> > > Provide a helper function for finishing write faults due to the PTE being
> > > read-only. The helper will be used by DAX to avoid complicating generic MM
> > > code with DAX locking specifics.
> > > 
> > > Signed-off-by: Jan Kara <jack@suse.cz>
> > > ---
> > >  include/linux/mm.h |  1 +
> > >  mm/memory.c        | 65 +++++++++++++++++++++++++++++++-----------------------
> > >  2 files changed, 39 insertions(+), 27 deletions(-)
> > > 
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 1055f2ece80d..e5a014be8932 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -617,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> > >  int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
> > >  		struct page *page);
> > >  int finish_fault(struct vm_fault *vmf);
> > > +int finish_mkwrite_fault(struct vm_fault *vmf);
> > >  #endif
> > >  
> > >  /*
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index f49e736d6a36..8c8cb7f2133e 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -2266,6 +2266,36 @@ oom:
> > >  	return VM_FAULT_OOM;
> > >  }
> > >  
> > > +/**
> > > + * finish_mkrite_fault - finish page fault making PTE writeable once the page
> >       finish_mkwrite_fault
> 
> Fixed, thanks.
> 
> > > @@ -2315,26 +2335,17 @@ static int wp_page_shared(struct vm_fault *vmf)
> > >  			put_page(vmf->page);
> > >  			return tmp;
> > >  		}
> > > -		/*
> > > -		 * Since we dropped the lock we need to revalidate
> > > -		 * the PTE as someone else may have changed it.  If
> > > -		 * they did, we just return, as we can count on the
> > > -		 * MMU to tell us if they didn't also make it writable.
> > > -		 */
> > > -		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > > -						vmf->address, &vmf->ptl);
> > > -		if (!pte_same(*vmf->pte, vmf->orig_pte)) {
> > > +		tmp = finish_mkwrite_fault(vmf);
> > > +		if (unlikely(!tmp || (tmp &
> > > +				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> > 
> > The 'tmp' return from finish_mkwrite_fault() can only be 0 or VM_FAULT_WRITE.
> > I think this test should just be 
> > 
> > 		if (unlikely(!tmp)) {
> 
> > Right, finish_mkwrite_fault() currently cannot return anything other than
> > "retry needed", which is indicated by tmp == 0. However, I'd prefer to keep
> > symmetry with the finish_fault() handler, which can return other errors, and
> > to be prepared to handle them from finish_mkwrite_fault() as well.

Fair enough.  You can add:

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Jan Kara Oct. 20, 2016, 8:48 a.m. UTC | #4
On Wed 19-10-16 11:21:52, Ross Zwisler wrote:
> On Wed, Oct 19, 2016 at 09:16:00AM +0200, Jan Kara wrote:
> > > > @@ -2315,26 +2335,17 @@ static int wp_page_shared(struct vm_fault *vmf)
> > > >  			put_page(vmf->page);
> > > >  			return tmp;
> > > >  		}
> > > > -		/*
> > > > -		 * Since we dropped the lock we need to revalidate
> > > > -		 * the PTE as someone else may have changed it.  If
> > > > -		 * they did, we just return, as we can count on the
> > > > -		 * MMU to tell us if they didn't also make it writable.
> > > > -		 */
> > > > -		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > > > -						vmf->address, &vmf->ptl);
> > > > -		if (!pte_same(*vmf->pte, vmf->orig_pte)) {
> > > > +		tmp = finish_mkwrite_fault(vmf);
> > > > +		if (unlikely(!tmp || (tmp &
> > > > +				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> > > 
> > > The 'tmp' return from finish_mkwrite_fault() can only be 0 or VM_FAULT_WRITE.
> > > I think this test should just be 
> > > 
> > > 		if (unlikely(!tmp)) {
> > 
> > Right, finish_mkwrite_fault() currently cannot return anything other than
> > "retry needed", which is indicated by tmp == 0. However, I'd prefer to keep
> > symmetry with the finish_fault() handler, which can return other errors, and
> > to be prepared to handle them from finish_mkwrite_fault() as well.
> 
> Fair enough.  You can add:
> 
> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

Thanks. Actually, your question made me take a harder look at the return
values of finish_mkwrite_fault(), and I've added one more commit switching
them so that finish_mkwrite_fault() returns 0 on success and VM_FAULT_NOPAGE
if the PTE changed. That is less confusing and more consistent with what
finish_fault() returns.

								Honza
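
A minimal sketch of the revised convention described above, assuming the body
otherwise stays as posted (this reflects the follow-up change Jan mentions,
not the patch below):

int finish_mkwrite_fault(struct vm_fault *vmf)
{
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;	/* PTE changed before we took the lock */
	}
	wp_page_reuse(vmf);
	return 0;			/* success, matching finish_fault() */
}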

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1055f2ece80d..e5a014be8932 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -617,6 +617,7 @@  static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 		struct page *page);
 int finish_fault(struct vm_fault *vmf);
+int finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index f49e736d6a36..8c8cb7f2133e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2266,6 +2266,36 @@  oom:
 	return VM_FAULT_OOM;
 }
 
+/**
+ * finish_mkrite_fault - finish page fault making PTE writeable once the page
+ *			 page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault due
+ * to PTE being read-only once the mapped page is prepared. It handles locking
+ * of PTE and modifying it. The function returns VM_FAULT_WRITE on success,
+ * 0 when PTE got changed before we acquired PTE lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+				       &vmf->ptl);
+	/*
+	 * We might have raced with another page fault while we released the
+	 * pte_offset_map_lock.
+	 */
+	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		return 0;
+	}
+	wp_page_reuse(vmf);
+	return VM_FAULT_WRITE;
+}
+
 /*
  * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
  * mapping
@@ -2282,16 +2312,7 @@  static int wp_pfn_shared(struct vm_fault *vmf)
 		ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
 		if (ret & VM_FAULT_ERROR)
 			return ret;
-		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-				vmf->address, &vmf->ptl);
-		/*
-		 * We might have raced with another page fault while we
-		 * released the pte_offset_map_lock.
-		 */
-		if (!pte_same(*vmf->pte, vmf->orig_pte)) {
-			pte_unmap_unlock(vmf->pte, vmf->ptl);
-			return 0;
-		}
+		return finish_mkwrite_fault(vmf);
 	}
 	wp_page_reuse(vmf);
 	return VM_FAULT_WRITE;
@@ -2301,7 +2322,6 @@  static int wp_page_shared(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	int page_mkwrite = 0;
 
 	get_page(vmf->page);
 
@@ -2315,26 +2335,17 @@  static int wp_page_shared(struct vm_fault *vmf)
 			put_page(vmf->page);
 			return tmp;
 		}
-		/*
-		 * Since we dropped the lock we need to revalidate
-		 * the PTE as someone else may have changed it.  If
-		 * they did, we just return, as we can count on the
-		 * MMU to tell us if they didn't also make it writable.
-		 */
-		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-						vmf->address, &vmf->ptl);
-		if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+		tmp = finish_mkwrite_fault(vmf);
+		if (unlikely(!tmp || (tmp &
+				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
 			unlock_page(vmf->page);
-			pte_unmap_unlock(vmf->pte, vmf->ptl);
 			put_page(vmf->page);
-			return 0;
+			return tmp;
 		}
-		page_mkwrite = 1;
-	}
-
-	wp_page_reuse(vmf);
-	if (!page_mkwrite)
+	} else {
+		wp_page_reuse(vmf);
 		lock_page(vmf->page);
+	}
 	fault_dirty_shared_page(vma, vmf->page);
 	put_page(vmf->page);