diff mbox

[v4,18/18] xfs, dax: wire up dax_flush_dma support via a new xfs_sync_dma helper

Message ID 151407705701.38751.6157642918168721562.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dan Williams Dec. 24, 2017, 12:57 a.m. UTC
xfs_break_layouts() scans for active pNFS layouts, drops locks and
rescans for those layouts to be broken. xfs_sync_dma performs
xfs_break_layouts and also scans for active dax-dma pages, drops locks
and rescans for those pages to go idle.

dax_flush_dma handles synchronizing against new page-busy events
(get_user_pages). iIt invalidates all mappings to trigger the
get_user_pages slow path which will eventually block on the
XFS_MMAPLOCK. If it finds a dma-busy page it waits for a page-idle
callback that will fire when the page reference count reaches 1 (recall
ZONE_DEVICE pages are idle at count 1). While it is waiting, it drops
locks so we do not deadlock the process that might be trying to elevate
the page count of more pages before arranging for any of them to go idle
as is typically the case of iov_iter_get_pages.

dax_flush_dma relies on the fs-provided wait_atomic_t_action_f
(xfs_wait_dax_page) to handle evaluating the page reference count and
dropping locks when waiting.

Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/Makefile    |    3 ++
 fs/xfs/xfs_dma.c   |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_dma.h   |   24 +++++++++++++++
 fs/xfs/xfs_file.c  |    6 ++--
 fs/xfs/xfs_ioctl.c |    7 ++--
 fs/xfs/xfs_iops.c  |    7 +++-
 6 files changed, 118 insertions(+), 10 deletions(-)
 create mode 100644 fs/xfs/xfs_dma.c
 create mode 100644 fs/xfs/xfs_dma.h

Comments

Darrick J. Wong Jan. 2, 2018, 9:07 p.m. UTC | #1
On Sat, Dec 23, 2017 at 04:57:37PM -0800, Dan Williams wrote:
> xfs_break_layouts() scans for active pNFS layouts, drops locks and
> rescans for those layouts to be broken. xfs_sync_dma performs
> xfs_break_layouts and also scans for active dax-dma pages, drops locks
> and rescans for those pages to go idle.
> 
> dax_flush_dma handles synchronizing against new page-busy events
> (get_user_pages). iIt invalidates all mappings to trigger the
> get_user_pages slow path which will eventually block on the
> XFS_MMAPLOCK. If it finds a dma-busy page it waits for a page-idle
> callback that will fire when the page reference count reaches 1 (recall
> ZONE_DEVICE pages are idle at count 1). While it is waiting, it drops
> locks so we do not deadlock the process that might be trying to elevate
> the page count of more pages before arranging for any of them to go idle
> as is typically the case of iov_iter_get_pages.
> 
> dax_flush_dma relies on the fs-provided wait_atomic_t_action_f
> (xfs_wait_dax_page) to handle evaluating the page reference count and
> dropping locks when waiting.
> 
> Cc: Jan Kara <jack@suse.cz>
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  fs/xfs/Makefile    |    3 ++
>  fs/xfs/xfs_dma.c   |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_dma.h   |   24 +++++++++++++++
>  fs/xfs/xfs_file.c  |    6 ++--
>  fs/xfs/xfs_ioctl.c |    7 ++--
>  fs/xfs/xfs_iops.c  |    7 +++-
>  6 files changed, 118 insertions(+), 10 deletions(-)
>  create mode 100644 fs/xfs/xfs_dma.c
>  create mode 100644 fs/xfs/xfs_dma.h
> 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 7ceb41a9786a..f2cdc5a3eb6c 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -129,6 +129,9 @@ xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
>  				   xfs_qm.o \
>  				   xfs_quotaops.o
>  
> +# dax dma
> +xfs-$(CONFIG_FS_DAX)		+= xfs_dma.o
> +
>  # xfs_rtbitmap is shared with libxfs
>  xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
>  
> diff --git a/fs/xfs/xfs_dma.c b/fs/xfs/xfs_dma.c
> new file mode 100644
> index 000000000000..3df1a51a76c4
> --- /dev/null
> +++ b/fs/xfs/xfs_dma.c
> @@ -0,0 +1,81 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0
> + * Copyright(c) 2017 Intel Corporation. All rights reserved.
> + */
> +#include <linux/dax.h>
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_shared.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_bit.h"
> +#include "xfs_sb.h"
> +#include "xfs_mount.h"
> +#include "xfs_defer.h"
> +#include "xfs_inode.h"
> +#include "xfs_pnfs.h"
> +
> +/*
> + * xfs_wait_dax_page - helper for dax_flush_dma to drop locks and sleep
> + * wait for a page idle event. Returns 1 if the locks did not need to be
> + * dropped and the page is idle, returns -EINTR if the sleep was
> + * interrupted and returns 1 when it slept. dax_flush_dma()
> + * retries/rescans all mappings when the lock is dropped.

What does the return 0 case signify?  I'm guessing "returns 1 if the
locks did not need to be dropped and the page is idle"?

> + */
> +static int xfs_wait_dax_page(
> +	atomic_t		*count,
> +	unsigned int		mode)
> +{

static int
xfs_wait_dax_page(
	atomci_t		*count,
	unsigned int		mode)
{

IOWs, the function name gets its own line.  The same applies to every
other function definition.

> +	uint			iolock = XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL;
> +	struct page 		*page = refcount_to_page(count);

Space after 'page' ^^^ but before tab...

> +	struct address_space	*mapping = page->mapping;
> +	struct inode		*inode = mapping->host;
> +	struct xfs_inode	*ip = XFS_I(inode);
> +
> +	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL));
> +
> +	if (page_ref_count(page) == 1)
> +		return 0;
> +
> +	xfs_iunlock(ip, iolock);
> +	schedule();
> +	xfs_ilock(ip, iolock);
> +
> +	if (signal_pending_state(mode, current))
> +		return -EINTR;
> +	return 1;
> +}
> +
> +/*
> + * Synchronize [R]DMA before changing the file's block map. For pNFS,
> + * recall all layouts. For DAX, wait for transient DMA to complete. All
> + * other DMA is handled by pinning page cache pages.
> + *
> + * iolock must held XFS_IOLOCK_SHARED or XFS_IOLOCK_EXCL on entry and
> + * will be XFS_IOLOCK_EXCL and XFS_MMAPLOCK_EXCL on exit.

Is it guaranteed that we never emerge from xfs_break_layouts with
IOLOCK_SHARED?  I /think/ the answer is yes, but this seems like a
subtlety that would be easy to screw up.

> + */
> +int xfs_sync_dma(
> +	struct inode		*inode,
> +	uint			*iolock)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	int			error;
> +
> +	while (true) {
> +		error = xfs_break_layouts(inode, iolock);
> +		if (error)
> +			break;
> +
> +		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> +		*iolock |= XFS_MMAPLOCK_EXCL;
> +
> +		error = dax_flush_dma(inode->i_mapping, xfs_wait_dax_page);
> +		if (error <= 0)
> +			break;
> +		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
> +		*iolock &= ~XFS_MMAPLOCK_EXCL;
> +	}
> +
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_dma.h b/fs/xfs/xfs_dma.h
> new file mode 100644
> index 000000000000..29635639b073
> --- /dev/null
> +++ b/fs/xfs/xfs_dma.h
> @@ -0,0 +1,24 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0
> + * Copyright(c) 2017 Intel Corporation. All rights reserved.
> + */
> +#ifndef __XFS_DMA__
> +#define __XFS_DMA__
> +#ifdef CONFIG_FS_DAX
> +int xfs_sync_dma(struct inode *inode, uint *iolock);
> +#else
> +#include "xfs_pnfs.h"
> +
> +static inline int xfs_sync_dma(struct inode *inode, uint *iolock)

I think we need to do this prior to reflinking into a file too, right?
Or at least we would if dax+reflink were a supported config.  I think
the reason we've never tripped over this is that neither pnfs nor dax
will have anything to do with reflinked files.

(brain creaking, trying to get back up to speed....)

--D

> +{
> +	int error = xfs_break_layouts(inode, iolock);
> +
> +	if (error)
> +		return error;
> +
> +	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_EXCL);
> +	*iolock |= XFS_MMAPLOCK_EXCL;
> +	return 0;
> +}
> +#endif /* CONFIG_FS_DAX */
> +#endif /* __XFS_DMA__ */
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 6df0c133a61e..84fc178da656 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -37,6 +37,7 @@
>  #include "xfs_log.h"
>  #include "xfs_icache.h"
>  #include "xfs_pnfs.h"
> +#include "xfs_dma.h"
>  #include "xfs_iomap.h"
>  #include "xfs_reflink.h"
>  
> @@ -778,12 +779,11 @@ xfs_file_fallocate(
>  		return -EOPNOTSUPP;
>  
>  	xfs_ilock(ip, iolock);
> -	error = xfs_break_layouts(inode, &iolock);
> +	error = xfs_sync_dma(inode, &iolock);
>  	if (error)
>  		goto out_unlock;
>  
> -	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> -	iolock |= XFS_MMAPLOCK_EXCL;
> +	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
>  
>  	if (mode & FALLOC_FL_PUNCH_HOLE) {
>  		error = xfs_free_file_space(ip, offset, len);
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 20dc65fef6a4..4340bef658b0 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -39,7 +39,7 @@
>  #include "xfs_icache.h"
>  #include "xfs_symlink.h"
>  #include "xfs_trans.h"
> -#include "xfs_pnfs.h"
> +#include "xfs_dma.h"
>  #include "xfs_acl.h"
>  #include "xfs_btree.h"
>  #include <linux/fsmap.h>
> @@ -643,12 +643,11 @@ xfs_ioc_space(
>  		return error;
>  
>  	xfs_ilock(ip, iolock);
> -	error = xfs_break_layouts(inode, &iolock);
> +	error = xfs_sync_dma(inode, &iolock);
>  	if (error)
>  		goto out_unlock;
>  
> -	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> -	iolock |= XFS_MMAPLOCK_EXCL;
> +	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
>  
>  	switch (bf->l_whence) {
>  	case 0: /*SEEK_SET*/
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 67bd97edc73b..c1055337b233 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -37,7 +37,7 @@
>  #include "xfs_da_btree.h"
>  #include "xfs_dir2.h"
>  #include "xfs_trans_space.h"
> -#include "xfs_pnfs.h"
> +#include "xfs_dma.h"
>  #include "xfs_iomap.h"
>  
>  #include <linux/capability.h>
> @@ -1030,11 +1030,12 @@ xfs_vn_setattr(
>  		struct xfs_inode	*ip = XFS_I(d_inode(dentry));
>  		uint			iolock = XFS_IOLOCK_EXCL;
>  
> -		error = xfs_break_layouts(d_inode(dentry), &iolock);
> +		error = xfs_sync_dma(d_inode(dentry), &iolock);
>  		if (error)
>  			return error;
>  
> -		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> +		ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
> +
>  		error = xfs_vn_setattr_size(dentry, iattr);
>  		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
>  	} else {
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Jan. 2, 2018, 11 p.m. UTC | #2
On Sat, Dec 23, 2017 at 04:57:37PM -0800, Dan Williams wrote:
> xfs_break_layouts() scans for active pNFS layouts, drops locks and
> rescans for those layouts to be broken. xfs_sync_dma performs
> xfs_break_layouts and also scans for active dax-dma pages, drops locks
> and rescans for those pages to go idle.
> 
> dax_flush_dma handles synchronizing against new page-busy events
> (get_user_pages). iIt invalidates all mappings to trigger the
> get_user_pages slow path which will eventually block on the
> XFS_MMAPLOCK. If it finds a dma-busy page it waits for a page-idle
> callback that will fire when the page reference count reaches 1 (recall
> ZONE_DEVICE pages are idle at count 1). While it is waiting, it drops
> locks so we do not deadlock the process that might be trying to elevate
> the page count of more pages before arranging for any of them to go idle
> as is typically the case of iov_iter_get_pages.
> 
> dax_flush_dma relies on the fs-provided wait_atomic_t_action_f
> (xfs_wait_dax_page) to handle evaluating the page reference count and
> dropping locks when waiting.

I don't see a problem with supporting this functionality, but I
see lots of problems with the code being presented. First of all,
I think the "sync dma" abstraction here is all wrong.

In the case of the filesystem, we don't care about whether DMA has
completed or not, and we *shouldn't have to care* about deep, dark
secrets of other subsystems.

If I read the current code, I see this in all the "truncate" paths:

	start op
	break layout leases
	change layout

and in the IO path:

	start IO
	break layout leases
	map IO
	issue IO

What this change does is make the truncate paths read:

	start op
	sync DMA
	change layout

but the IO path is unchanged. (This is not explained in comments or
commit messages).

And I look at that "sync DMA" step and wonder why the hell we need
to "sync DMA" because DMA has nothing to do with high level
filesystem code. It doesn't tell me anything obvious about why we
need to do this, nor does it tell me what we're actually
synchronising against.

What we care about in the filesystem code is whether there are
existing external references to file layout. If there's an external
reference, then it has to be broken before we can proceed and modify
the file layout. We don't care what owns that reference, just that
it has to broken before we continue.

AFAIC, these DMA references are just another external layout
reference that needs to be broken.  IOWs, this "sync DMA" complexity
needs to go inside xfs_break_layouts() as it is part of breaking the
external reference to the file layout - it does not replace the
layout breaking abstraction and so the implementation needs to
reflect that.

> + * Synchronize [R]DMA before changing the file's block map. For pNFS,
> + * recall all layouts. For DAX, wait for transient DMA to complete. All
> + * other DMA is handled by pinning page cache pages.
> + *
> + * iolock must held XFS_IOLOCK_SHARED or XFS_IOLOCK_EXCL on entry and
> + * will be XFS_IOLOCK_EXCL and XFS_MMAPLOCK_EXCL on exit.
> + */
> +int xfs_sync_dma(
> +	struct inode		*inode,
> +	uint			*iolock)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	int			error;
> +
> +	while (true) {
> +		error = xfs_break_layouts(inode, iolock);
> +		if (error)
> +			break;
> +
> +		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> +		*iolock |= XFS_MMAPLOCK_EXCL;
> +
> +		error = dax_flush_dma(inode->i_mapping, xfs_wait_dax_page);
> +		if (error <= 0)
> +			break;
> +		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
> +		*iolock &= ~XFS_MMAPLOCK_EXCL;
> +	}

At this level the loop seems sub-optimal. If we don't drop the
IOLOCK, then we have no reason to call xfs_break_layouts() a second
time.  Hence in isolation this loop doesn' make sense. Yes, I
realise that dax_flush_dma() can result in all locks on the inode
being dropped, but that's hidden in another function whose calling
scope is not at all obvious from this code.

Also, xfs_wait_dax_page() assumes we have IOLOCK_EXCL held when it
is called. Nothing enforces the requirement that xfs_sync_dma() is
passed XFS_IOLOCK_EXCL, and so such assumptions cannot be made.
Even if it was, I really dislike the idea of a function that
/assumes/ lock state - that's a landmine that will bite us in the
rear end at some unexpected point in the future. If you need to
cycle held locks on an inode, you need to pass the held lock state
to the function.

Another gripe I have is that calling xfs_sync_dma() implies the mmap
lock is held exclusively on return. Hiding this locking inside
xfs_sync_dma removes the code documentation that large tracts of
code are protected against page faults by the XFS_MMAPLOCK_EXCL lock
call.  Instead of knowing at a glance that the truncate path
(xfs_vn_setattr) is protected against page faults, I have to
remember that xfs_sync_dma() now does this.

This goes back to my initial comments of "what the hell does "sync
dma" mean in a filesystem context?" - it certainly doesn't make me
think about inode locking. I don't like hidden/implicitly locking
like this, because it breaks both the least-surprise and the
self-documenting code principles....

Cheers,

Dave.
Dan Williams Jan. 3, 2018, 2:21 a.m. UTC | #3
On Tue, Jan 2, 2018 at 3:00 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Sat, Dec 23, 2017 at 04:57:37PM -0800, Dan Williams wrote:
>> xfs_break_layouts() scans for active pNFS layouts, drops locks and
>> rescans for those layouts to be broken. xfs_sync_dma performs
>> xfs_break_layouts and also scans for active dax-dma pages, drops locks
>> and rescans for those pages to go idle.
>>
>> dax_flush_dma handles synchronizing against new page-busy events
>> (get_user_pages). iIt invalidates all mappings to trigger the
>> get_user_pages slow path which will eventually block on the
>> XFS_MMAPLOCK. If it finds a dma-busy page it waits for a page-idle
>> callback that will fire when the page reference count reaches 1 (recall
>> ZONE_DEVICE pages are idle at count 1). While it is waiting, it drops
>> locks so we do not deadlock the process that might be trying to elevate
>> the page count of more pages before arranging for any of them to go idle
>> as is typically the case of iov_iter_get_pages.
>>
>> dax_flush_dma relies on the fs-provided wait_atomic_t_action_f
>> (xfs_wait_dax_page) to handle evaluating the page reference count and
>> dropping locks when waiting.
>
> I don't see a problem with supporting this functionality, but I
> see lots of problems with the code being presented. First of all,
> I think the "sync dma" abstraction here is all wrong.
>
> In the case of the filesystem, we don't care about whether DMA has
> completed or not, and we *shouldn't have to care* about deep, dark
> secrets of other subsystems.
>
> If I read the current code, I see this in all the "truncate" paths:
>
>         start op
>         break layout leases
>         change layout
>
> and in the IO path:
>
>         start IO
>         break layout leases
>         map IO
>         issue IO
>
> What this change does is make the truncate paths read:
>
>         start op
>         sync DMA
>         change layout
>
> but the IO path is unchanged. (This is not explained in comments or
> commit messages).
>
> And I look at that "sync DMA" step and wonder why the hell we need
> to "sync DMA" because DMA has nothing to do with high level
> filesystem code. It doesn't tell me anything obvious about why we
> need to do this, nor does it tell me what we're actually
> synchronising against.
>
> What we care about in the filesystem code is whether there are
> existing external references to file layout. If there's an external
> reference, then it has to be broken before we can proceed and modify
> the file layout. We don't care what owns that reference, just that
> it has to broken before we continue.
>
> AFAIC, these DMA references are just another external layout
> reference that needs to be broken.  IOWs, this "sync DMA" complexity
> needs to go inside xfs_break_layouts() as it is part of breaking the
> external reference to the file layout - it does not replace the
> layout breaking abstraction and so the implementation needs to
> reflect that.

These two sentences from the xfs_break_layouts() comment scared me
down this path of distinguishing dax-dma waiting from pNFS layout
lease break waiting:

---

"Additionally we call it during the write operation, where aren't
concerned about exposing unallocated blocks but just want to provide
basic synchronization between a local writer and pNFS clients.  mmap
writes would also benefit from this sort of synchronization, but due
to the tricky locking rules in the page fault path we don't bother."

---

I was not sure about holding XFS_MMAPLOCK_EXCL over
xfs_break_layouts() where it has historically not been held, and I was
worried about the potential deadlock of requiring all pages to be
unmapped and idle during a write. I.e. would we immediately deadlock
if userspace performed  direct-I/O to a file with a source buffer that
was mapped from that same file?

In general though, I agree that xfs_break_layouts() should comprehend
both cases. I'll investigate if the deadlock is real and perhaps add a
flag to xfs_break_layouts to distinguish the IO path from the truncate
paths to at least make that detail internal to the layout breaking
mechanism.

>> + * Synchronize [R]DMA before changing the file's block map. For pNFS,
>> + * recall all layouts. For DAX, wait for transient DMA to complete. All
>> + * other DMA is handled by pinning page cache pages.
>> + *
>> + * iolock must held XFS_IOLOCK_SHARED or XFS_IOLOCK_EXCL on entry and
>> + * will be XFS_IOLOCK_EXCL and XFS_MMAPLOCK_EXCL on exit.
>> + */
>> +int xfs_sync_dma(
>> +     struct inode            *inode,
>> +     uint                    *iolock)
>> +{
>> +     struct xfs_inode        *ip = XFS_I(inode);
>> +     int                     error;
>> +
>> +     while (true) {
>> +             error = xfs_break_layouts(inode, iolock);
>> +             if (error)
>> +                     break;
>> +
>> +             xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
>> +             *iolock |= XFS_MMAPLOCK_EXCL;
>> +
>> +             error = dax_flush_dma(inode->i_mapping, xfs_wait_dax_page);
>> +             if (error <= 0)
>> +                     break;
>> +             xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
>> +             *iolock &= ~XFS_MMAPLOCK_EXCL;
>> +     }
>
> At this level the loop seems sub-optimal. If we don't drop the
> IOLOCK, then we have no reason to call xfs_break_layouts() a second
> time.  Hence in isolation this loop doesn' make sense. Yes, I
> realise that dax_flush_dma() can result in all locks on the inode
> being dropped, but that's hidden in another function whose calling
> scope is not at all obvious from this code.
>
> Also, xfs_wait_dax_page() assumes we have IOLOCK_EXCL held when it
> is called. Nothing enforces the requirement that xfs_sync_dma() is
> passed XFS_IOLOCK_EXCL, and so such assumptions cannot be made.
> Even if it was, I really dislike the idea of a function that
> /assumes/ lock state - that's a landmine that will bite us in the
> rear end at some unexpected point in the future. If you need to
> cycle held locks on an inode, you need to pass the held lock state
> to the function.

I agree, and I thought about this, but at the time the callback is
made the only way we could pass the lock context to
xfs_wait_dax_page() would be to temporarily store it in the 'struct
page' which seemed ugly at first glance.

However, we do have space, we could alias it with the other unused
8-bytes of page->lru.

At least that cleans up the filesystem interface to not need to make
implicit locking assumptions... I'll go that route.

> Another gripe I have is that calling xfs_sync_dma() implies the mmap
> lock is held exclusively on return. Hiding this locking inside
> xfs_sync_dma removes the code documentation that large tracts of
> code are protected against page faults by the XFS_MMAPLOCK_EXCL lock
> call.  Instead of knowing at a glance that the truncate path
> (xfs_vn_setattr) is protected against page faults, I have to
> remember that xfs_sync_dma() now does this.
>
> This goes back to my initial comments of "what the hell does "sync
> dma" mean in a filesystem context?" - it certainly doesn't make me
> think about inode locking. I don't like hidden/implicitly locking
> like this, because it breaks both the least-surprise and the
> self-documenting code principles....

Thanks for this Dave, it's clear. I'll a spin a new version with some
of the reworks proposed above, but holler if those don't address your
core concern.
Dave Chinner Jan. 3, 2018, 7:51 a.m. UTC | #4
On Tue, Jan 02, 2018 at 06:21:13PM -0800, Dan Williams wrote:
> On Tue, Jan 2, 2018 at 3:00 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Sat, Dec 23, 2017 at 04:57:37PM -0800, Dan Williams wrote:
> >> xfs_break_layouts() scans for active pNFS layouts, drops locks and
> >> rescans for those layouts to be broken. xfs_sync_dma performs
> >> xfs_break_layouts and also scans for active dax-dma pages, drops locks
> >> and rescans for those pages to go idle.
> >>
> >> dax_flush_dma handles synchronizing against new page-busy events
> >> (get_user_pages). iIt invalidates all mappings to trigger the
> >> get_user_pages slow path which will eventually block on the
> >> XFS_MMAPLOCK. If it finds a dma-busy page it waits for a page-idle
> >> callback that will fire when the page reference count reaches 1 (recall
> >> ZONE_DEVICE pages are idle at count 1). While it is waiting, it drops
> >> locks so we do not deadlock the process that might be trying to elevate
> >> the page count of more pages before arranging for any of them to go idle
> >> as is typically the case of iov_iter_get_pages.
> >>
> >> dax_flush_dma relies on the fs-provided wait_atomic_t_action_f
> >> (xfs_wait_dax_page) to handle evaluating the page reference count and
> >> dropping locks when waiting.
> >
> > I don't see a problem with supporting this functionality, but I
> > see lots of problems with the code being presented. First of all,
> > I think the "sync dma" abstraction here is all wrong.
> >
> > In the case of the filesystem, we don't care about whether DMA has
> > completed or not, and we *shouldn't have to care* about deep, dark
> > secrets of other subsystems.
> >
> > If I read the current code, I see this in all the "truncate" paths:
> >
> >         start op
> >         break layout leases
> >         change layout
> >
> > and in the IO path:
> >
> >         start IO
> >         break layout leases
> >         map IO
> >         issue IO
> >
> > What this change does is make the truncate paths read:
> >
> >         start op
> >         sync DMA
> >         change layout
> >
> > but the IO path is unchanged. (This is not explained in comments or
> > commit messages).
> >
> > And I look at that "sync DMA" step and wonder why the hell we need
> > to "sync DMA" because DMA has nothing to do with high level
> > filesystem code. It doesn't tell me anything obvious about why we
> > need to do this, nor does it tell me what we're actually
> > synchronising against.
> >
> > What we care about in the filesystem code is whether there are
> > existing external references to file layout. If there's an external
> > reference, then it has to be broken before we can proceed and modify
> > the file layout. We don't care what owns that reference, just that
> > it has to broken before we continue.
> >
> > AFAIC, these DMA references are just another external layout
> > reference that needs to be broken.  IOWs, this "sync DMA" complexity
> > needs to go inside xfs_break_layouts() as it is part of breaking the
> > external reference to the file layout - it does not replace the
> > layout breaking abstraction and so the implementation needs to
> > reflect that.
> 
> These two sentences from the xfs_break_layouts() comment scared me
> down this path of distinguishing dax-dma waiting from pNFS layout
> lease break waiting:
> 
> ---
> 
> "Additionally we call it during the write operation, where aren't
> concerned about exposing unallocated blocks but just want to provide
> basic synchronization between a local writer and pNFS clients.  mmap
> writes would also benefit from this sort of synchronization, but due
> to the tricky locking rules in the page fault path we don't bother."
> ---

The pnfs code  went into 3.20 (4.0, IIRC), whilst the XFS_MMAPLOCK
code went into 4.1. So the pnfs code was written and tested by
Christoph a long time before I added the XFS_MMAPLOCK, despite them
landing only one release apart. We've never really gone back to look
at this because there hasn't been a need until now....

> I was not sure about holding XFS_MMAPLOCK_EXCL over
> xfs_break_layouts() where it has historically not been held, and I was
> worried about the potential deadlock of requiring all pages to be
> unmapped and idle during a write. I.e. would we immediately deadlock
> if userspace performed  direct-I/O to a file with a source buffer that
> was mapped from that same file?

Most likely.

> In general though, I agree that xfs_break_layouts() should comprehend
> both cases. I'll investigate if the deadlock is real and perhaps add a
> flag to xfs_break_layouts to distinguish the IO path from the truncate
> paths to at least make that detail internal to the layout breaking
> mechanism.

We can't hold the XFS_MMAPLOCK over the direct IO write submission
path. That will cause deadlocks as it will invert the
mmap_sem/XFS_MMAPLOCK order via get_user_pages_fast(). That's the
whole reason we have the IOLOCK and the MMAPLOCK - neither can be
taken in both the IO path and the page fault path because of
mmap_sem inversions, hence we need a lock per path for truncate
exclusion....

We can take the MMAPLOCK briefly during IO setup (e.g. where we are
breaking layouts) but we have to drop it before calling into the
iomap code where the mmap_sem may be taken.....

> >> + * Synchronize [R]DMA before changing the file's block map. For pNFS,
> >> + * recall all layouts. For DAX, wait for transient DMA to complete. All
> >> + * other DMA is handled by pinning page cache pages.
> >> + *
> >> + * iolock must held XFS_IOLOCK_SHARED or XFS_IOLOCK_EXCL on entry and
> >> + * will be XFS_IOLOCK_EXCL and XFS_MMAPLOCK_EXCL on exit.
> >> + */
> >> +int xfs_sync_dma(
> >> +     struct inode            *inode,
> >> +     uint                    *iolock)
> >> +{
> >> +     struct xfs_inode        *ip = XFS_I(inode);
> >> +     int                     error;
> >> +
> >> +     while (true) {
> >> +             error = xfs_break_layouts(inode, iolock);
> >> +             if (error)
> >> +                     break;
> >> +
> >> +             xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> >> +             *iolock |= XFS_MMAPLOCK_EXCL;
> >> +
> >> +             error = dax_flush_dma(inode->i_mapping, xfs_wait_dax_page);
> >> +             if (error <= 0)
> >> +                     break;
> >> +             xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
> >> +             *iolock &= ~XFS_MMAPLOCK_EXCL;
> >> +     }
> >
> > At this level the loop seems sub-optimal. If we don't drop the
> > IOLOCK, then we have no reason to call xfs_break_layouts() a second
> > time.  Hence in isolation this loop doesn' make sense. Yes, I
> > realise that dax_flush_dma() can result in all locks on the inode
> > being dropped, but that's hidden in another function whose calling
> > scope is not at all obvious from this code.
> >
> > Also, xfs_wait_dax_page() assumes we have IOLOCK_EXCL held when it
> > is called. Nothing enforces the requirement that xfs_sync_dma() is
> > passed XFS_IOLOCK_EXCL, and so such assumptions cannot be made.
> > Even if it was, I really dislike the idea of a function that
> > /assumes/ lock state - that's a landmine that will bite us in the
> > rear end at some unexpected point in the future. If you need to
> > cycle held locks on an inode, you need to pass the held lock state
> > to the function.
> 
> I agree, and I thought about this, but at the time the callback is
> made the only way we could pass the lock context to
> xfs_wait_dax_page() would be to temporarily store it in the 'struct
> page' which seemed ugly at first glance.

I haven't looked at how you are implementing that callback, but it's
parameters are ... a bit strange. If we're waiting on a page, then
it should be passed the page, not an atomic_t....

Cheers,

Dave.
Christoph Hellwig Jan. 4, 2018, 8:33 a.m. UTC | #5
On Wed, Jan 03, 2018 at 10:00:27AM +1100, Dave Chinner wrote:
> AFAIC, these DMA references are just another external layout
> reference that needs to be broken.  IOWs, this "sync DMA" complexity
> needs to go inside xfs_break_layouts() as it is part of breaking the
> external reference to the file layout - it does not replace the
> layout breaking abstraction and so the implementation needs to
> reflect that.

Agreed.
Christoph Hellwig Jan. 4, 2018, 8:34 a.m. UTC | #6
On Wed, Jan 03, 2018 at 06:51:12PM +1100, Dave Chinner wrote:
> > "Additionally we call it during the write operation, where aren't
> > concerned about exposing unallocated blocks but just want to provide
> > basic synchronization between a local writer and pNFS clients.  mmap
> > writes would also benefit from this sort of synchronization, but due
> > to the tricky locking rules in the page fault path we don't bother."
> > ---
> 
> The pnfs code  went into 3.20 (4.0, IIRC), whilst the XFS_MMAPLOCK
> code went into 4.1. So the pnfs code was written and tested by
> Christoph a long time before I added the XFS_MMAPLOCK, despite them
> landing only one release apart. We've never really gone back to look
> at this because there hasn't been a need until now....

I suspect we should drop the MMAPLOCK as well, but I'd need to re-read
and re-test the code.
diff mbox

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7ceb41a9786a..f2cdc5a3eb6c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -129,6 +129,9 @@  xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
 				   xfs_qm.o \
 				   xfs_quotaops.o
 
+# dax dma
+xfs-$(CONFIG_FS_DAX)		+= xfs_dma.o
+
 # xfs_rtbitmap is shared with libxfs
 xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 
diff --git a/fs/xfs/xfs_dma.c b/fs/xfs/xfs_dma.c
new file mode 100644
index 000000000000..3df1a51a76c4
--- /dev/null
+++ b/fs/xfs/xfs_dma.c
@@ -0,0 +1,81 @@ 
+/*
+ * SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ */
+#include <linux/dax.h>
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_pnfs.h"
+
+/*
+ * xfs_wait_dax_page - helper for dax_flush_dma to drop locks and sleep
+ * wait for a page idle event. Returns 1 if the locks did not need to be
+ * dropped and the page is idle, returns -EINTR if the sleep was
+ * interrupted and returns 1 when it slept. dax_flush_dma()
+ * retries/rescans all mappings when the lock is dropped.
+ */
+static int xfs_wait_dax_page(
+	atomic_t		*count,
+	unsigned int		mode)
+{
+	uint			iolock = XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL;
+	struct page 		*page = refcount_to_page(count);
+	struct address_space	*mapping = page->mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL));
+
+	if (page_ref_count(page) == 1)
+		return 0;
+
+	xfs_iunlock(ip, iolock);
+	schedule();
+	xfs_ilock(ip, iolock);
+
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 1;
+}
+
+/*
+ * Synchronize [R]DMA before changing the file's block map. For pNFS,
+ * recall all layouts. For DAX, wait for transient DMA to complete. All
+ * other DMA is handled by pinning page cache pages.
+ *
+ * iolock must held XFS_IOLOCK_SHARED or XFS_IOLOCK_EXCL on entry and
+ * will be XFS_IOLOCK_EXCL and XFS_MMAPLOCK_EXCL on exit.
+ */
+int xfs_sync_dma(
+	struct inode		*inode,
+	uint			*iolock)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
+
+	while (true) {
+		error = xfs_break_layouts(inode, iolock);
+		if (error)
+			break;
+
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+		*iolock |= XFS_MMAPLOCK_EXCL;
+
+		error = dax_flush_dma(inode->i_mapping, xfs_wait_dax_page);
+		if (error <= 0)
+			break;
+		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+		*iolock &= ~XFS_MMAPLOCK_EXCL;
+	}
+
+	return error;
+}
diff --git a/fs/xfs/xfs_dma.h b/fs/xfs/xfs_dma.h
new file mode 100644
index 000000000000..29635639b073
--- /dev/null
+++ b/fs/xfs/xfs_dma.h
@@ -0,0 +1,24 @@ 
+/*
+ * SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ */
+#ifndef __XFS_DMA__
+#define __XFS_DMA__
+#ifdef CONFIG_FS_DAX
+int xfs_sync_dma(struct inode *inode, uint *iolock);
+#else
+#include "xfs_pnfs.h"
+
+static inline int xfs_sync_dma(struct inode *inode, uint *iolock)
+{
+	int error = xfs_break_layouts(inode, iolock);
+
+	if (error)
+		return error;
+
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_EXCL);
+	*iolock |= XFS_MMAPLOCK_EXCL;
+	return 0;
+}
+#endif /* CONFIG_FS_DAX */
+#endif /* __XFS_DMA__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6df0c133a61e..84fc178da656 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@ 
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_dma.h"
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
 
@@ -778,12 +779,11 @@  xfs_file_fallocate(
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock);
+	error = xfs_sync_dma(inode, &iolock);
 	if (error)
 		goto out_unlock;
 
-	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	iolock |= XFS_MMAPLOCK_EXCL;
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 20dc65fef6a4..4340bef658b0 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -39,7 +39,7 @@ 
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_trans.h"
-#include "xfs_pnfs.h"
+#include "xfs_dma.h"
 #include "xfs_acl.h"
 #include "xfs_btree.h"
 #include <linux/fsmap.h>
@@ -643,12 +643,11 @@  xfs_ioc_space(
 		return error;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock);
+	error = xfs_sync_dma(inode, &iolock);
 	if (error)
 		goto out_unlock;
 
-	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	iolock |= XFS_MMAPLOCK_EXCL;
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
 
 	switch (bf->l_whence) {
 	case 0: /*SEEK_SET*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 67bd97edc73b..c1055337b233 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -37,7 +37,7 @@ 
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
-#include "xfs_pnfs.h"
+#include "xfs_dma.h"
 #include "xfs_iomap.h"
 
 #include <linux/capability.h>
@@ -1030,11 +1030,12 @@  xfs_vn_setattr(
 		struct xfs_inode	*ip = XFS_I(d_inode(dentry));
 		uint			iolock = XFS_IOLOCK_EXCL;
 
-		error = xfs_break_layouts(d_inode(dentry), &iolock);
+		error = xfs_sync_dma(d_inode(dentry), &iolock);
 		if (error)
 			return error;
 
-		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+		ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
 		error = xfs_vn_setattr_size(dentry, iattr);
 		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 	} else {