diff mbox

[2/3] xfs: consolidate the various page fault handlers

Message ID 20170824152207.30729-3-hch@lst.de (mailing list archive)
State Superseded
Headers show

Commit Message

Christoph Hellwig Aug. 24, 2017, 3:22 p.m. UTC
Add a new __xfs_filemap_fault helper that implements all four page fault
callouts, and make these methods themselves small stubs that set the
correct write_fault flag, and exit early for the non-DAX case for the
hugepage related ones.

Life would be so much simpler if we only had one method for all this.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c  | 120 +++++++++++++++--------------------------------------
 fs/xfs/xfs_trace.h |  24 +++++++++--
 2 files changed, 54 insertions(+), 90 deletions(-)

Comments

Ross Zwisler Aug. 24, 2017, 9:29 p.m. UTC | #1
On Thu, Aug 24, 2017 at 05:22:06PM +0200, Christoph Hellwig wrote:
> Add a new __xfs_filemap_fault helper that implements all four page fault
> callouts, and make these methods themselves small stubs that set the
> correct write_fault flag, and exit early for the non-DAX case for the
> hugepage related ones.
> 
> Life would be so much simpler if we only had one method for all this.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

A few nits and a question below.  This looks much simpler and gets rid of a
lot of duplication.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

<>
>  STATIC int
>  xfs_filemap_huge_fault(
>  	struct vm_fault		*vmf,
>  	enum page_entry_size	pe_size)
>  {
> -	struct inode		*inode = file_inode(vmf->vma->vm_file);
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	int			ret;
> -
> -	if (!IS_DAX(inode))
> +	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
>  		return VM_FAULT_FALLBACK;
>  
> -	trace_xfs_filemap_huge_fault(ip);
> -
> -	if (vmf->flags & FAULT_FLAG_WRITE) {
> -		sb_start_pagefault(inode->i_sb);
> -		file_update_time(vmf->vma->vm_file);
> -	}
> -
> -	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -	ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops);
> -	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -
> -	if (vmf->flags & FAULT_FLAG_WRITE)
> -		sb_end_pagefault(inode->i_sb);
> +	/* DAX can shortcut the normal fault path on write faults! */

Does this comment still make sense for the PMD case?  Here's what I *think* it
means: For DAX write faults we make the PTE writeable in the ->fault() code
and don't rely on a follow-up ->page_mkwrite() call.  This happens in
do_shared_fault(), where we return before the ->page_mkwrite() call because
the DAX write fault returns VM_FAULT_NOPAGE.

This is in contrast to the normal page fault case where the ->fault() call
ends up calling filemap_fault(), which populates the page read-only, and then
do_shared_fault() does a follow-up ->page_mkwrite() call which makes the page
writable.

First, is the above correct?  :)  If so, I think the comment doesn't make
sense for the PMD fault path because in that case we always make the PMD
writeable in the first ->huge_fault() call, as there is no follow-up
*_mkwrite()?

> +	return __xfs_filemap_fault(vmf, pe_size,
> +			(vmf->flags & FAULT_FLAG_WRITE));
> +}
>  
> -	return ret;
> +STATIC int
> +xfs_filemap_page_mkwrite(
> +	struct vm_fault		*vmf)
> +{
> +	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
>  }
>  
> -/*
> - * pfn_mkwrite was originally inteneded to ensure we capture time stamp
> - * updates on write faults. In reality, it's need to serialise against
> - * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
> - * to ensure we serialise the fault barrier in place.
> - */
>  static int

   STATIC

>  xfs_filemap_pfn_mkwrite(
>  	struct vm_fault		*vmf)
>  {
> -
> -	struct inode		*inode = file_inode(vmf->vma->vm_file);
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	int			ret = VM_FAULT_NOPAGE;
> -	loff_t			size;
> -
> -	trace_xfs_filemap_pfn_mkwrite(ip);
> -
> -	sb_start_pagefault(inode->i_sb);
> -	file_update_time(vmf->vma->vm_file);
> -
> -	/* check if the faulting page hasn't raced with truncate */
> -	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
> -	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;

I see that this size check is gone from the new code, and we now rely on the
equivalent check in dax_iomap_fault().  Nice.

> -	if (vmf->pgoff >= size)
> -		ret = VM_FAULT_SIGBUS;
> -	else if (IS_DAX(inode))
> -		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops);
> -	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
> -	sb_end_pagefault(inode->i_sb);
> -	return ret;
> -
> +	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
> +		return VM_FAULT_NOPAGE;
> +	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
>  }
>  
>  static const struct vm_operations_struct xfs_file_vm_ops = {
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index bcc3cdf8e1c5..3f41d5340eb8 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -689,10 +689,26 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
>  DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
>  DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
>  
> -DEFINE_INODE_EVENT(xfs_filemap_fault);
> -DEFINE_INODE_EVENT(xfs_filemap_huge_fault);
> -DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
> -DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
> +TRACE_EVENT(xfs_filemap_fault,
> +	TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
> +		 bool write_fault),
> +	TP_ARGS(ip, pe_size, write_fault),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_ino_t, ino)
> +		__field(enum page_entry_size, pe_size)
> +		__field(bool, write_fault)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = VFS_I(ip)->i_sb->s_dev;
> +		__entry->ino = ip->i_ino;
> +		__entry->pe_size = pe_size;
> +		__entry->write_fault = write_fault;
> +	),
> +	TP_printk("dev %d:%d ino 0x%llx pe_size %d write_fault %d",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->ino, __entry->pe_size, __entry->write_fault)
> +)

I wonder if pe_size would be more readable as a string via __print_symbolic()
so we see "pe_size PMD" instead of "pe_size 1" from enum page_entry_size?
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Aug. 25, 2017, 7:15 a.m. UTC | #2
On Thu, Aug 24, 2017 at 03:29:42PM -0600, Ross Zwisler wrote:
> > +	/* DAX can shortcut the normal fault path on write faults! */
> 
> Does this comment still make sense for the PMD case?

I kept the code as before, and also kept the comment intent as-is
(although I shortend it a bit).

> Here's what I *think* it
> means: For DAX write faults we make the PTE writeable in the ->fault() code
> and don't rely on a follow-up ->page_mkwrite() call.  This happens in
> do_shared_fault(), where we return before the ->page_mkwrite() call because
> the DAX write fault returns VM_FAULT_NOPAGE.
> 
> This is in contrast to the normal page fault case where the ->fault() call
> ends up calling filemap_fault(), which populates the page read-only, and then
> do_shared_fault() does a follow-up ->page_mkwrite() call which makes the page
> writable.
> 
> First, is the above correct?  :)

Yes.

> If so, I think the comment doesn't make
> sense for the PMD fault path because in that case we always make the PMD
> writeable in the first ->huge_fault() call, as there is no follow-up
> *_mkwrite()?

Well, there is non-DAX huge_fault case.  It still is different from the
normal non-hugepage fault case, so the comment still makes some sense,
but maybe I should remove the DAX.  That being said if we eventually
get huge page page cache support (patches are on the list) I suspect
they'll work like the normal fault path, not the DAX path.

> >  static int
> 
>    STATIC

We're phasing that out, so I should probably remove it where it's
currently newly added/moved in the patch.

> I see that this size check is gone from the new code, and we now rely on the
> equivalent check in dax_iomap_fault().  Nice.

Yes.  And I actually used to mention that in the changelog, but it got
lost.  I'll re-add it.

> I wonder if pe_size would be more readable as a string via __print_symbolic()
> so we see "pe_size PMD" instead of "pe_size 1" from enum page_entry_size?

Probably.  I was just lazy :)  If I want to go all the way I could also
use a __print_flags for the FAULT_FLAG_* flags.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ross Zwisler Aug. 28, 2017, 5:50 p.m. UTC | #3
On Fri, Aug 25, 2017 at 09:15:38AM +0200, Christoph Hellwig wrote:
> On Thu, Aug 24, 2017 at 03:29:42PM -0600, Ross Zwisler wrote:
<>
> > I wonder if pe_size would be more readable as a string via __print_symbolic()
> > so we see "pe_size PMD" instead of "pe_size 1" from enum page_entry_size?
> 
> Probably.  I was just lazy :)  If I want to go all the way I could also
> use a __print_flags for the FAULT_FLAG_* flags.

Yep - I'm doing that in include/trace/events/fs_dax.h if you want to just copy
& paste.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Aug. 28, 2017, 5:52 p.m. UTC | #4
On Mon, Aug 28, 2017 at 11:50:14AM -0600, Ross Zwisler wrote:
> Yep - I'm doing that in include/trace/events/fs_dax.h if you want to just copy
> & paste.

Oh, nice.  But maybe we need the trace point in the core MM instead
of in XFS and the DAX code? :)
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ross Zwisler Aug. 28, 2017, 8:09 p.m. UTC | #5
On Mon, Aug 28, 2017 at 07:52:57PM +0200, Christoph Hellwig wrote:
> On Mon, Aug 28, 2017 at 11:50:14AM -0600, Ross Zwisler wrote:
> > Yep - I'm doing that in include/trace/events/fs_dax.h if you want to just copy
> > & paste.
> 
> Oh, nice.  But maybe we need the trace point in the core MM instead
> of in XFS and the DAX code? :)

Yea, there aren't any tracepoints at all in mm/memory.c.  I'm not sure if
that's because we just haven't gotten around to putting them in, or if for
some reason it was intentional to not trace that bit of code?
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4213f02325a2..460ca9639e66 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1012,34 +1012,36 @@  xfs_file_llseek(
  *         i_lock (XFS - extent map serialisation)
  */
 
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
 STATIC int
-xfs_filemap_page_mkwrite(
-	struct vm_fault		*vmf)
+__xfs_filemap_fault(
+	struct vm_fault		*vmf,
+	enum page_entry_size	pe_size,
+	bool			write_fault)
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
+	struct xfs_inode	*ip = XFS_I(inode);
 	int			ret;
 
-	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+	trace_xfs_filemap_fault(ip, pe_size, write_fault);
 
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vmf->vma->vm_file);
-	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	if (write_fault) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vmf->vma->vm_file);
+	}
 
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops);
 	} else {
-		ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
+		if (write_fault)
+			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
+		else
+			ret = filemap_fault(vmf);
 	}
-
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
 
+	if (write_fault)
+		sb_end_pagefault(inode->i_sb);
 	return ret;
 }
 
@@ -1047,93 +1049,39 @@  STATIC int
 xfs_filemap_fault(
 	struct vm_fault		*vmf)
 {
-	struct inode		*inode = file_inode(vmf->vma->vm_file);
-	int			ret;
-
-	trace_xfs_filemap_fault(XFS_I(inode));
-
 	/* DAX can shortcut the normal fault path on write faults! */
-	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
-		return xfs_filemap_page_mkwrite(vmf);
-
-	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops);
-	else
-		ret = filemap_fault(vmf);
-	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-
-	return ret;
+	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
+			IS_DAX(file_inode(vmf->vma->vm_file)) &&
+			(vmf->flags & FAULT_FLAG_WRITE));
 }
 
-/*
- * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
- * both read and write faults. Hence we need to handle both cases. There is no
- * ->huge_mkwrite callout for huge pages, so we have a single function here to
- * handle both cases here. @flags carries the information on the type of fault
- * occuring.
- */
 STATIC int
 xfs_filemap_huge_fault(
 	struct vm_fault		*vmf,
 	enum page_entry_size	pe_size)
 {
-	struct inode		*inode = file_inode(vmf->vma->vm_file);
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			ret;
-
-	if (!IS_DAX(inode))
+	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
 		return VM_FAULT_FALLBACK;
 
-	trace_xfs_filemap_huge_fault(ip);
-
-	if (vmf->flags & FAULT_FLAG_WRITE) {
-		sb_start_pagefault(inode->i_sb);
-		file_update_time(vmf->vma->vm_file);
-	}
-
-	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops);
-	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-
-	if (vmf->flags & FAULT_FLAG_WRITE)
-		sb_end_pagefault(inode->i_sb);
+	/* DAX can shortcut the normal fault path on write faults! */
+	return __xfs_filemap_fault(vmf, pe_size,
+			(vmf->flags & FAULT_FLAG_WRITE));
+}
 
-	return ret;
+STATIC int
+xfs_filemap_page_mkwrite(
+	struct vm_fault		*vmf)
+{
+	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
-/*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
- */
 static int
 xfs_filemap_pfn_mkwrite(
 	struct vm_fault		*vmf)
 {
-
-	struct inode		*inode = file_inode(vmf->vma->vm_file);
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			ret = VM_FAULT_NOPAGE;
-	loff_t			size;
-
-	trace_xfs_filemap_pfn_mkwrite(ip);
-
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vmf->vma->vm_file);
-
-	/* check if the faulting page hasn't raced with truncate */
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		ret = VM_FAULT_SIGBUS;
-	else if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
-	return ret;
-
+	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
+		return VM_FAULT_NOPAGE;
+	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bcc3cdf8e1c5..3f41d5340eb8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -689,10 +689,26 @@  DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
-DEFINE_INODE_EVENT(xfs_filemap_fault);
-DEFINE_INODE_EVENT(xfs_filemap_huge_fault);
-DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
-DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
+TRACE_EVENT(xfs_filemap_fault,
+	TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
+		 bool write_fault),
+	TP_ARGS(ip, pe_size, write_fault),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(enum page_entry_size, pe_size)
+		__field(bool, write_fault)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->pe_size = pe_size;
+		__entry->write_fault = write_fault;
+	),
+	TP_printk("dev %d:%d ino 0x%llx pe_size %d write_fault %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->pe_size, __entry->write_fault)
+)
 
 DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),