diff mbox

xfs: unlock i_mutex in xfs_break_layouts

Message ID 1428420944-20965-1-git-send-email-hch@lst.de (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig April 7, 2015, 3:35 p.m. UTC
We want to drop all I/O path locks when recalling layouts, and that includes
i_mutex for the write path.  Without this we get stuck processe when recalls
take too long.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c  | 4 ++--
 fs/xfs/xfs_ioctl.c | 2 +-
 fs/xfs/xfs_iops.c  | 2 +-
 fs/xfs/xfs_pnfs.c  | 7 ++++++-
 fs/xfs/xfs_pnfs.h  | 2 +-
 5 files changed, 11 insertions(+), 6 deletions(-)

Comments

J. Bruce Fields April 7, 2015, 9:07 p.m. UTC | #1
On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> We want to drop all I/O path locks when recalling layouts, and that includes
> i_mutex for the write path.  Without this we get stuck processe when recalls
> take too long.

Also if the writer is an nfsd thread than we'd rather just error out
than wait.

--b.

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_file.c  | 4 ++--
>  fs/xfs/xfs_ioctl.c | 2 +-
>  fs/xfs/xfs_iops.c  | 2 +-
>  fs/xfs/xfs_pnfs.c  | 7 ++++++-
>  fs/xfs/xfs_pnfs.h  | 2 +-
>  5 files changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index a2e1cb8..963dfb2 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -559,7 +559,7 @@ restart:
>  	if (error)
>  		return error;
>  
> -	error = xfs_break_layouts(inode, iolock);
> +	error = xfs_break_layouts(inode, iolock, true);
>  	if (error)
>  		return error;
>  
> @@ -843,7 +843,7 @@ xfs_file_fallocate(
>  		return -EOPNOTSUPP;
>  
>  	xfs_ilock(ip, iolock);
> -	error = xfs_break_layouts(inode, &iolock);
> +	error = xfs_break_layouts(inode, &iolock, false);
>  	if (error)
>  		goto out_unlock;
>  
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index ac4feae..3a21cc7 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -639,7 +639,7 @@ xfs_ioc_space(
>  		return error;
>  
>  	xfs_ilock(ip, iolock);
> -	error = xfs_break_layouts(inode, &iolock);
> +	error = xfs_break_layouts(inode, &iolock, false);
>  	if (error)
>  		goto out_unlock;
>  
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index e53a903..79520c7 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -975,7 +975,7 @@ xfs_vn_setattr(
>  		uint		iolock = XFS_IOLOCK_EXCL;
>  
>  		xfs_ilock(ip, iolock);
> -		error = xfs_break_layouts(dentry->d_inode, &iolock);
> +		error = xfs_break_layouts(dentry->d_inode, &iolock, true);
>  		if (!error)
>  			error = xfs_setattr_size(ip, iattr);
>  		xfs_iunlock(ip, iolock);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 365dd57..981a657 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -31,7 +31,8 @@
>  int
>  xfs_break_layouts(
>  	struct inode		*inode,
> -	uint			*iolock)
> +	uint			*iolock,
> +	bool			with_imutex)
>  {
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	int			error;
> @@ -40,8 +41,12 @@ xfs_break_layouts(
>  
>  	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
>  		xfs_iunlock(ip, *iolock);
> +		if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
> +			mutex_unlock(&inode->i_mutex);
>  		error = break_layout(inode, true);
>  		*iolock = XFS_IOLOCK_EXCL;
> +		if (with_imutex)
> +			mutex_lock(&inode->i_mutex);
>  		xfs_ilock(ip, *iolock);
>  	}
>  
> diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
> index b7fbfce..f749475 100644
> --- a/fs/xfs/xfs_pnfs.h
> +++ b/fs/xfs/xfs_pnfs.h
> @@ -8,7 +8,7 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
>  int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
>  		struct iattr *iattr);
>  
> -int xfs_break_layouts(struct inode *inode, uint *iolock);
> +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
>  #else
>  static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
>  {
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner April 7, 2015, 10:19 p.m. UTC | #2
On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> We want to drop all I/O path locks when recalling layouts, and that includes
> i_mutex for the write path.  Without this we get stuck processe when recalls
> take too long.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
.....
>  		xfs_iunlock(ip, iolock);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 365dd57..981a657 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -31,7 +31,8 @@
>  int
>  xfs_break_layouts(
>  	struct inode		*inode,
> -	uint			*iolock)
> +	uint			*iolock,
> +	bool			with_imutex)
>  {
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	int			error;
> @@ -40,8 +41,12 @@ xfs_break_layouts(
>  
>  	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
>  		xfs_iunlock(ip, *iolock);
> +		if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
> +			mutex_unlock(&inode->i_mutex);
>  		error = break_layout(inode, true);
>  		*iolock = XFS_IOLOCK_EXCL;
> +		if (with_imutex)
> +			mutex_lock(&inode->i_mutex);
>  		xfs_ilock(ip, *iolock);
>  	}

That's kinda nasty, and it has no documentation explaining when or
why we'd need to drop the i_mutex. How are we supposed to know if we
need to drop the i_mutex or not? What happens if the upper VFS
layers change or we have a multiple call paths that have different
i_mutex contexts (i.e. one holds, another doesn't)?

Which makes me wonder - is this layout breaking stuff at the right
layer?

Cheers,

Dave.
Christoph Hellwig April 8, 2015, 4:21 p.m. UTC | #3
On Tue, Apr 07, 2015 at 05:07:47PM -0400, J. Bruce Fields wrote:
> On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> > We want to drop all I/O path locks when recalling layouts, and that includes
> > i_mutex for the write path.  Without this we get stuck processe when recalls
> > take too long.
> 
> Also if the writer is an nfsd thread than we'd rather just error out
> than wait.

We have no way to know we are called by nfsd here unfortunately.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig April 8, 2015, 4:24 p.m. UTC | #4
On Wed, Apr 08, 2015 at 08:19:27AM +1000, Dave Chinner wrote:
> That's kinda nasty, and it has no documentation explaining when or
> why we'd need to drop the i_mutex. How are we supposed to know if we
> need to drop the i_mutex or not?

We need to drop it if we hold it, pretty easy.

> What happens if the upper VFS
> layers change or we have a multiple call paths that have different
> i_mutex contexts (i.e. one holds, another doesn't)?

We avoid this in the VFS, as everytime we had it filesystems were getting it
wrong.

However you have a point in that we should probably have asserts that the
right locks are held.

> Which makes me wonder - is this layout breaking stuff at the right
> layer?

We can't do it in the VFS as it needs to be atomic vs the lock that
protects write in ->write and ->fallocate, which is only taken in
the filesystem.  For ->setattr in theory we could do it in the VFS,
but if the other callers can't do it in the VFS that will just lead
to code duplication.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields April 8, 2015, 6:16 p.m. UTC | #5
On Wed, Apr 08, 2015 at 06:21:04PM +0200, Christoph Hellwig wrote:
> On Tue, Apr 07, 2015 at 05:07:47PM -0400, J. Bruce Fields wrote:
> > On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> > > We want to drop all I/O path locks when recalling layouts, and that includes
> > > i_mutex for the write path.  Without this we get stuck processe when recalls
> > > take too long.
> > 
> > Also if the writer is an nfsd thread than we'd rather just error out
> > than wait.

(To be clear: ACK to this patch as far as I'm concerned, I've got
another concern but we need this fix regardless.)

> We have no way to know we are called by nfsd here unfortunately.

I was imagining the possible deadlock here as mostly theoretical, but
now that I think of it it doesn't sound unlikely at all:

	- file is under heavy write load
	- conflicting operation breaks layout
	- nfsd threads all block in writes to that file
	- no nfsd threads available to service layout return
	- recall times out, client fenced.

Ugh.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a2e1cb8..963dfb2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -559,7 +559,7 @@  restart:
 	if (error)
 		return error;
 
-	error = xfs_break_layouts(inode, iolock);
+	error = xfs_break_layouts(inode, iolock, true);
 	if (error)
 		return error;
 
@@ -843,7 +843,7 @@  xfs_file_fallocate(
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock);
+	error = xfs_break_layouts(inode, &iolock, false);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ac4feae..3a21cc7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -639,7 +639,7 @@  xfs_ioc_space(
 		return error;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock);
+	error = xfs_break_layouts(inode, &iolock, false);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e53a903..79520c7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -975,7 +975,7 @@  xfs_vn_setattr(
 		uint		iolock = XFS_IOLOCK_EXCL;
 
 		xfs_ilock(ip, iolock);
-		error = xfs_break_layouts(dentry->d_inode, &iolock);
+		error = xfs_break_layouts(dentry->d_inode, &iolock, true);
 		if (!error)
 			error = xfs_setattr_size(ip, iattr);
 		xfs_iunlock(ip, iolock);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 365dd57..981a657 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,7 +31,8 @@ 
 int
 xfs_break_layouts(
 	struct inode		*inode,
-	uint			*iolock)
+	uint			*iolock,
+	bool			with_imutex)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			error;
@@ -40,8 +41,12 @@  xfs_break_layouts(
 
 	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
 		xfs_iunlock(ip, *iolock);
+		if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
+			mutex_unlock(&inode->i_mutex);
 		error = break_layout(inode, true);
 		*iolock = XFS_IOLOCK_EXCL;
+		if (with_imutex)
+			mutex_lock(&inode->i_mutex);
 		xfs_ilock(ip, *iolock);
 	}
 
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b7fbfce..f749475 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,7 +8,7 @@  int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
 		struct iattr *iattr);
 
-int xfs_break_layouts(struct inode *inode, uint *iolock);
+int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
 #else
 static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
 {