diff mbox

[v9,9/9] xfs, dax: introduce xfs_break_dax_layouts()

Message ID 152461283072.17530.11313844322317294220.stgit@dwillia2-desk3.amr.corp.intel.com
State New, archived
Headers show

Commit Message

Dan Williams April 24, 2018, 11:33 p.m. UTC
xfs_break_dax_layouts(), similar to xfs_break_leased_layouts(), scans
for busy / pinned dax pages and waits for those pages to go idle before
any potential extent unmap operation.

dax_layout_busy_page() handles synchronizing against new page-busy
events (get_user_pages). It invalidates all mappings to trigger the
get_user_pages slow path which will eventually block on the xfs inode
lock held in XFS_MMAPLOCK_EXCL mode. If dax_layout_busy_page() finds a
busy page it returns it for xfs to wait for the page-idle event that
will fire when the page reference count reaches 1 (recall ZONE_DEVICE
pages are idle at count 1, see generic_dax_pagefree()).

While waiting, the XFS_MMAPLOCK_EXCL lock is dropped in order to not
deadlock the process that might be trying to elevate the page count of
more pages before arranging for any of them to go idle. I.e. the typical
case of submitting I/O is that iov_iter_get_pages() elevates the
reference count of all pages in the I/O before starting I/O on the first
page. The process of elevating the reference count of all pages involved
in an I/O may cause faults that need to take XFS_MMAPLOCK_EXCL.

Although XFS_MMAPLOCK_EXCL is dropped while waiting, XFS_IOLOCK_EXCL is
held while sleeping. We need this to prevent starvation of the truncate
path as continuous submission of direct-I/O could starve the truncate
path indefinitely if the lock is dropped.

Cc: Dave Chinner <david@fromorbit.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c |   59 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 11 deletions(-)

Comments

Jan Kara May 9, 2018, 12:27 p.m. UTC | #1
On Tue 24-04-18 16:33:50, Dan Williams wrote:
> xfs_break_dax_layouts(), similar to xfs_break_leased_layouts(), scans
> for busy / pinned dax pages and waits for those pages to go idle before
> any potential extent unmap operation.
> 
> dax_layout_busy_page() handles synchronizing against new page-busy
> events (get_user_pages). It invalidates all mappings to trigger the
> get_user_pages slow path which will eventually block on the xfs inode
> lock held in XFS_MMAPLOCK_EXCL mode. If dax_layout_busy_page() finds a
> busy page it returns it for xfs to wait for the page-idle event that
> will fire when the page reference count reaches 1 (recall ZONE_DEVICE
> pages are idle at count 1, see generic_dax_pagefree()).
> 
> While waiting, the XFS_MMAPLOCK_EXCL lock is dropped in order to not
> deadlock the process that might be trying to elevate the page count of
> more pages before arranging for any of them to go idle. I.e. the typical
> case of submitting I/O is that iov_iter_get_pages() elevates the
> reference count of all pages in the I/O before starting I/O on the first
> page. The process of elevating the reference count of all pages involved
> in an I/O may cause faults that need to take XFS_MMAPLOCK_EXCL.
> 
> Although XFS_MMAPLOCK_EXCL is dropped while waiting, XFS_IOLOCK_EXCL is
> held while sleeping. We need this to prevent starvation of the truncate
> path as continuous submission of direct-I/O could starve the truncate
> path indefinitely if the lock is dropped.
> 
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Reported-by: Jan Kara <jack@suse.cz>
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Looks good to me except some nits below. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

for as much as it is worth with XFS code ;)

> +static int
> +xfs_break_dax_layouts(
> +	struct inode		*inode,
> +	uint			iolock,
> +	bool			*did_unlock)
> +{
> +	struct page		*page;
> +
> +	*did_unlock = false;
> +	page = dax_layout_busy_page(inode->i_mapping);
> +	if (!page)
> +		return 0;
> +
> +	return ___wait_var_event(&page->_refcount,
> +			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
> +			0, 0, xfs_wait_dax_page(inode, did_unlock));
> +}
> +
>  int
>  xfs_break_layouts(
>  	struct inode		*inode,
> @@ -729,17 +760,23 @@ xfs_break_layouts(
>  
>  	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
>  
> -	switch (reason) {
> -	case BREAK_UNMAP:
> -		ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
> -		/* fall through */
> -	case BREAK_WRITE:
> -		error = xfs_break_leased_layouts(inode, iolock, &retry);
> -		break;
> -	default:
> -		WARN_ON_ONCE(1);
> -		return -EINVAL;
> -	}
> +	do {
> +		switch (reason) {
> +		case BREAK_UNMAP:
> +			ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

Maybe move the assertion to xfs_break_dax_layouts()?

> +
> +			error = xfs_break_dax_layouts(inode, *iolock, &retry);
> +			/* fall through */
> +		case BREAK_WRITE:
> +			if (error || retry)
> +				break;

The error handling IMHO belongs above the 'fall through' comment above.

> +			error = xfs_break_leased_layouts(inode, iolock, &retry);
> +			break;
> +		default:
> +			WARN_ON_ONCE(1);
> +			return -EINVAL;
> +		}
> +	} while (error == 0 && retry);

As a general 'taste' comment, I prefer if the 'retry' is always initialized
to 'false' at the beginning of the loop body in these kinds of loops. That
way it is obvious we are doing the right thing when looking at the loop
body and we don't have to verify that each case statement initializes
'retry' properly (in fact I'd remove the initialization from
xfs_break_dax_layouts() and xfs_break_leased_layouts()). But this is more a
matter of taste and consistency with other code in the area so I defer to
XFS maintainers for a final opinion. Darrick?

								Honza
Darrick J. Wong May 9, 2018, 2:38 p.m. UTC | #2
On Tue, Apr 24, 2018 at 04:33:50PM -0700, Dan Williams wrote:
> xfs_break_dax_layouts(), similar to xfs_break_leased_layouts(), scans
> for busy / pinned dax pages and waits for those pages to go idle before
> any potential extent unmap operation.
> 
> dax_layout_busy_page() handles synchronizing against new page-busy
> events (get_user_pages). It invalidates all mappings to trigger the
> get_user_pages slow path which will eventually block on the xfs inode
> lock held in XFS_MMAPLOCK_EXCL mode. If dax_layout_busy_page() finds a
> busy page it returns it for xfs to wait for the page-idle event that
> will fire when the page reference count reaches 1 (recall ZONE_DEVICE
> pages are idle at count 1, see generic_dax_pagefree()).
> 
> While waiting, the XFS_MMAPLOCK_EXCL lock is dropped in order to not
> deadlock the process that might be trying to elevate the page count of
> more pages before arranging for any of them to go idle. I.e. the typical
> case of submitting I/O is that iov_iter_get_pages() elevates the
> reference count of all pages in the I/O before starting I/O on the first
> page. The process of elevating the reference count of all pages involved
> in an I/O may cause faults that need to take XFS_MMAPLOCK_EXCL.
> 
> Although XFS_MMAPLOCK_EXCL is dropped while waiting, XFS_IOLOCK_EXCL is
> held while sleeping. We need this to prevent starvation of the truncate
> path as continuous submission of direct-I/O could starve the truncate
> path indefinitely if the lock is dropped.
> 
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Reported-by: Jan Kara <jack@suse.cz>
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

I should've acked this explicitly since it's xfs code,
Acked-by: Darrick J. Wong <darrick.wong@oracle.com>

The rest of it looks fine enough to me too, but there's no
Acked-by-goober tag to put on them. :P

--D

> ---
>  fs/xfs/xfs_file.c |   59 +++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 48 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 1a5176b21803..4e98d0dcc035 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -718,6 +718,37 @@ xfs_file_write_iter(
>  	return ret;
>  }
>  
> +static void
> +xfs_wait_dax_page(
> +	struct inode		*inode,
> +	bool			*did_unlock)
> +{
> +	struct xfs_inode        *ip = XFS_I(inode);
> +
> +	*did_unlock = true;
> +	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
> +	schedule();
> +	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> +}
> +
> +static int
> +xfs_break_dax_layouts(
> +	struct inode		*inode,
> +	uint			iolock,
> +	bool			*did_unlock)
> +{
> +	struct page		*page;
> +
> +	*did_unlock = false;
> +	page = dax_layout_busy_page(inode->i_mapping);
> +	if (!page)
> +		return 0;
> +
> +	return ___wait_var_event(&page->_refcount,
> +			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
> +			0, 0, xfs_wait_dax_page(inode, did_unlock));
> +}
> +
>  int
>  xfs_break_layouts(
>  	struct inode		*inode,
> @@ -729,17 +760,23 @@ xfs_break_layouts(
>  
>  	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
>  
> -	switch (reason) {
> -	case BREAK_UNMAP:
> -		ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
> -		/* fall through */
> -	case BREAK_WRITE:
> -		error = xfs_break_leased_layouts(inode, iolock, &retry);
> -		break;
> -	default:
> -		WARN_ON_ONCE(1);
> -		return -EINVAL;
> -	}
> +	do {
> +		switch (reason) {
> +		case BREAK_UNMAP:
> +			ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
> +
> +			error = xfs_break_dax_layouts(inode, *iolock, &retry);
> +			/* fall through */
> +		case BREAK_WRITE:
> +			if (error || retry)
> +				break;
> +			error = xfs_break_leased_layouts(inode, iolock, &retry);
> +			break;
> +		default:
> +			WARN_ON_ONCE(1);
> +			return -EINVAL;
> +		}
> +	} while (error == 0 && retry);
>  
>  	return error;
>  }
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams May 9, 2018, 10:54 p.m. UTC | #3
On Wed, May 9, 2018 at 5:27 AM, Jan Kara <jack@suse.cz> wrote:
> On Tue 24-04-18 16:33:50, Dan Williams wrote:
>> xfs_break_dax_layouts(), similar to xfs_break_leased_layouts(), scans
>> for busy / pinned dax pages and waits for those pages to go idle before
>> any potential extent unmap operation.
>>
>> dax_layout_busy_page() handles synchronizing against new page-busy
>> events (get_user_pages). It invalidates all mappings to trigger the
>> get_user_pages slow path which will eventually block on the xfs inode
>> lock held in XFS_MMAPLOCK_EXCL mode. If dax_layout_busy_page() finds a
>> busy page it returns it for xfs to wait for the page-idle event that
>> will fire when the page reference count reaches 1 (recall ZONE_DEVICE
>> pages are idle at count 1, see generic_dax_pagefree()).
>>
>> While waiting, the XFS_MMAPLOCK_EXCL lock is dropped in order to not
>> deadlock the process that might be trying to elevate the page count of
>> more pages before arranging for any of them to go idle. I.e. the typical
>> case of submitting I/O is that iov_iter_get_pages() elevates the
>> reference count of all pages in the I/O before starting I/O on the first
>> page. The process of elevating the reference count of all pages involved
>> in an I/O may cause faults that need to take XFS_MMAPLOCK_EXCL.
>>
>> Although XFS_MMAPLOCK_EXCL is dropped while waiting, XFS_IOLOCK_EXCL is
>> held while sleeping. We need this to prevent starvation of the truncate
>> path as continuous submission of direct-I/O could starve the truncate
>> path indefinitely if the lock is dropped.
>>
>> Cc: Dave Chinner <david@fromorbit.com>
>> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> Reported-by: Jan Kara <jack@suse.cz>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> Looks good to me except some nits below. Feel free to add:
>
> Reviewed-by: Jan Kara <jack@suse.cz>
>
> for as much as it is worth with XFS code ;)
>
>> +static int
>> +xfs_break_dax_layouts(
>> +     struct inode            *inode,
>> +     uint                    iolock,
>> +     bool                    *did_unlock)
>> +{
>> +     struct page             *page;
>> +
>> +     *did_unlock = false;
>> +     page = dax_layout_busy_page(inode->i_mapping);
>> +     if (!page)
>> +             return 0;
>> +
>> +     return ___wait_var_event(&page->_refcount,
>> +                     atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
>> +                     0, 0, xfs_wait_dax_page(inode, did_unlock));
>> +}
>> +
>>  int
>>  xfs_break_layouts(
>>       struct inode            *inode,
>> @@ -729,17 +760,23 @@ xfs_break_layouts(
>>
>>       ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
>>
>> -     switch (reason) {
>> -     case BREAK_UNMAP:
>> -             ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
>> -             /* fall through */
>> -     case BREAK_WRITE:
>> -             error = xfs_break_leased_layouts(inode, iolock, &retry);
>> -             break;
>> -     default:
>> -             WARN_ON_ONCE(1);
>> -             return -EINVAL;
>> -     }
>> +     do {
>> +             switch (reason) {
>> +             case BREAK_UNMAP:
>> +                     ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
>
> Maybe move the assertion to xfs_break_dax_layouts()?

Makes sense, sure.

>
>> +
>> +                     error = xfs_break_dax_layouts(inode, *iolock, &retry);
>> +                     /* fall through */
>> +             case BREAK_WRITE:
>> +                     if (error || retry)
>> +                             break;
>
> The error handling IMHO belongs above the 'fall through' comment above.

Ok, yes I think this location is a stale holdover.

>
>> +                     error = xfs_break_leased_layouts(inode, iolock, &retry);
>> +                     break;
>> +             default:
>> +                     WARN_ON_ONCE(1);
>> +                     return -EINVAL;
>> +             }
>> +     } while (error == 0 && retry);
>
> As a general 'taste' comment, I prefer if the 'retry' is always initialized
> to 'false' at the beginning of the loop body in these kinds of loops. That
> way it is obvious we are doing the right thing when looking at the loop
> body and we don't have to verify that each case statement initializes
> 'retry' properly (in fact I'd remove the initialization from
> xfs_break_dax_layouts() and xfs_break_leased_layouts()). But this is more a
> matter of taste and consistency with other code in the area so I defer to
> XFS maintainers for a final opinion. Darrick?

I'm fine with making this change, I'll proceed unless / until Darrick says no.
diff mbox

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1a5176b21803..4e98d0dcc035 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -718,6 +718,37 @@  xfs_file_write_iter(
 	return ret;
 }
 
+static void
+xfs_wait_dax_page(
+	struct inode		*inode,
+	bool			*did_unlock)
+{
+	struct xfs_inode        *ip = XFS_I(inode);
+
+	*did_unlock = true;
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+	schedule();
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+}
+
+static int
+xfs_break_dax_layouts(
+	struct inode		*inode,
+	uint			iolock,
+	bool			*did_unlock)
+{
+	struct page		*page;
+
+	*did_unlock = false;
+	page = dax_layout_busy_page(inode->i_mapping);
+	if (!page)
+		return 0;
+
+	return ___wait_var_event(&page->_refcount,
+			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+			0, 0, xfs_wait_dax_page(inode, did_unlock));
+}
+
 int
 xfs_break_layouts(
 	struct inode		*inode,
@@ -729,17 +760,23 @@  xfs_break_layouts(
 
 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
 
-	switch (reason) {
-	case BREAK_UNMAP:
-		ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
-		/* fall through */
-	case BREAK_WRITE:
-		error = xfs_break_leased_layouts(inode, iolock, &retry);
-		break;
-	default:
-		WARN_ON_ONCE(1);
-		return -EINVAL;
-	}
+	do {
+		switch (reason) {
+		case BREAK_UNMAP:
+			ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+
+			error = xfs_break_dax_layouts(inode, *iolock, &retry);
+			/* fall through */
+		case BREAK_WRITE:
+			if (error || retry)
+				break;
+			error = xfs_break_leased_layouts(inode, iolock, &retry);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return -EINVAL;
+		}
+	} while (error == 0 && retry);
 
 	return error;
 }