diff mbox series

[v2] fuse: fix deadlock between atomic O_TRUNC open() and page invalidations

Message ID 20211229040239.66075-1-zhangjiachen.jaycee@bytedance.com (mailing list archive)
State New, archived
Headers show
Series [v2] fuse: fix deadlock between atomic O_TRUNC open() and page invalidations | expand

Commit Message

Jiachen Zhang Dec. 29, 2021, 4:02 a.m. UTC
fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic
O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on
atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache()
in such a case to avoid the A-A deadlock. However, we found another A-B-B-A
deadlock related to the case above, which will cause the xfstests
generic/464 testcase hung in our virtio-fs test environment.

For example, consider two processes concurrently open one same file, one
with O_TRUNC and another without O_TRUNC. The deadlock case is described
below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying
to lock a page (acquiring B), open() could have held the page lock
(acquired B), and waiting on the page writeback (acquiring A). This would
lead to deadlocks.

open(O_TRUNC)
----------------------------------------------------------------
fuse_open_common
  inode_lock            [C acquire]
  fuse_set_nowrite      [A acquire]

  fuse_finish_open
    truncate_pagecache
      lock_page         [B acquire]
      truncate_inode_page
      unlock_page       [B release]

  fuse_release_nowrite  [A release]
  inode_unlock          [C release]
----------------------------------------------------------------

open()
----------------------------------------------------------------
fuse_open_common
  fuse_finish_open
    invalidate_inode_pages2
      lock_page         [B acquire]
	fuse_launder_page
	  fuse_wait_on_page_writeback [A acquire & release]
      unlock_page       [B release]
----------------------------------------------------------------

Besides this case, all calls of invalidate_inode_pages2() and
invalidate_inode_pages2_range() in fuse code also can deadlock with
open(O_TRUNC). This commit tries to fix it by adding a new lock,
atomic_o_trunc, to protect the areas with the A-B-B-A deadlock risk.

Signed-off-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
---
 fs/fuse/dax.c    |  4 ++--
 fs/fuse/dir.c    |  2 +-
 fs/fuse/file.c   | 28 ++++++++++++++++++++++++++--
 fs/fuse/fuse_i.h |  7 +++++++
 fs/fuse/inode.c  |  7 ++++---
 5 files changed, 40 insertions(+), 8 deletions(-)

Comments

Miklos Szeredi Feb. 23, 2022, 8:50 a.m. UTC | #1
On Wed, Dec 29, 2021 at 12:02:39PM +0800, Jiachen Zhang wrote:
> fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic
> O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on
> atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache()
> in such a case to avoid the A-A deadlock. However, we found another A-B-B-A
> deadlock related to the case above, which will cause the xfstests
> generic/464 testcase hung in our virtio-fs test environment.
> 
> For example, consider two processes concurrently open one same file, one
> with O_TRUNC and another without O_TRUNC. The deadlock case is described
> below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying
> to lock a page (acquiring B), open() could have held the page lock
> (acquired B), and waiting on the page writeback (acquiring A). This would
> lead to deadlocks.
> 
> open(O_TRUNC)
> ----------------------------------------------------------------
> fuse_open_common
>   inode_lock            [C acquire]
>   fuse_set_nowrite      [A acquire]
> 
>   fuse_finish_open
>     truncate_pagecache
>       lock_page         [B acquire]
>       truncate_inode_page
>       unlock_page       [B release]
> 
>   fuse_release_nowrite  [A release]
>   inode_unlock          [C release]
> ----------------------------------------------------------------
> 
> open()
> ----------------------------------------------------------------
> fuse_open_common
>   fuse_finish_open
>     invalidate_inode_pages2
>       lock_page         [B acquire]
> 	fuse_launder_page
> 	  fuse_wait_on_page_writeback [A acquire & release]
>       unlock_page       [B release]
> ----------------------------------------------------------------
> 
> Besides this case, all calls of invalidate_inode_pages2() and
> invalidate_inode_pages2_range() in fuse code also can deadlock with
> open(O_TRUNC). This commit tries to fix it by adding a new lock,
> atomic_o_trunc, to protect the areas with the A-B-B-A deadlock risk.

Thanks.  Can you please try the following patch?  Instead of introducing a new
lock it tries to fix this by moving the truncate_pagecache() out of the nowrite
protected section.

Thanks,
Miklos

---
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 656e921f3506..56f439719129 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -537,6 +537,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	struct fuse_file *ff;
 	void *security_ctx = NULL;
 	u32 security_ctxlen;
+	bool trunc = flags & O_TRUNC;
 
 	/* Userspace expects S_IFREG in create mode */
 	BUG_ON((mode & S_IFMT) != S_IFREG);
@@ -561,7 +562,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	inarg.mode = mode;
 	inarg.umask = current_umask();
 
-	if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
+	if (fm->fc->handle_killpriv_v2 && trunc &&
 	    !(flags & O_EXCL) && !capable(CAP_FSETID)) {
 		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
 	}
@@ -623,6 +624,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	} else {
 		file->private_data = ff;
 		fuse_finish_open(inode, file);
+		if (fm->fc->atomic_o_trunc && trunc)
+			truncate_pagecache(inode, 0);
 	}
 	return err;
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 829094451774..2e041708ef44 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -210,7 +210,6 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 		i_size_write(inode, 0);
 		spin_unlock(&fi->lock);
-		truncate_pagecache(inode, 0);
 		file_update_time(file);
 		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
 	} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
@@ -239,30 +238,32 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	if (err)
 		return err;
 
-	if (is_wb_truncate || dax_truncate) {
+	if (is_wb_truncate || dax_truncate)
 		inode_lock(inode);
-		fuse_set_nowrite(inode);
-	}
 
 	if (dax_truncate) {
 		filemap_invalidate_lock(inode->i_mapping);
 		err = fuse_dax_break_layouts(inode, 0, 0);
 		if (err)
-			goto out;
+			goto out_inode_unlock;
 	}
 
+	if (is_wb_truncate || dax_truncate)
+		fuse_set_nowrite(inode);
+
 	err = fuse_do_open(fm, get_node_id(inode), file, isdir);
 	if (!err)
 		fuse_finish_open(inode, file);
 
-out:
+	if (is_wb_truncate | dax_truncate)
+		fuse_release_nowrite(inode);
+	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+		truncate_pagecache(inode, 0);
 	if (dax_truncate)
 		filemap_invalidate_unlock(inode->i_mapping);
-
-	if (is_wb_truncate | dax_truncate) {
-		fuse_release_nowrite(inode);
+out_inode_unlock:
+	if (is_wb_truncate | dax_truncate)
 		inode_unlock(inode);
-	}
 
 	return err;
 }
Jiachen Zhang March 4, 2022, 6:23 a.m. UTC | #2
On Wed, Feb 23, 2022 at 4:50 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Wed, Dec 29, 2021 at 12:02:39PM +0800, Jiachen Zhang wrote:
> > fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic
> > O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on
> > atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache()
> > in such a case to avoid the A-A deadlock. However, we found another A-B-B-A
> > deadlock related to the case above, which will cause the xfstests
> > generic/464 testcase hung in our virtio-fs test environment.
> >
> > For example, consider two processes concurrently open one same file, one
> > with O_TRUNC and another without O_TRUNC. The deadlock case is described
> > below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying
> > to lock a page (acquiring B), open() could have held the page lock
> > (acquired B), and waiting on the page writeback (acquiring A). This would
> > lead to deadlocks.
> >
> > open(O_TRUNC)
> > ----------------------------------------------------------------
> > fuse_open_common
> >   inode_lock            [C acquire]
> >   fuse_set_nowrite      [A acquire]
> >
> >   fuse_finish_open
> >     truncate_pagecache
> >       lock_page         [B acquire]
> >       truncate_inode_page
> >       unlock_page       [B release]
> >
> >   fuse_release_nowrite  [A release]
> >   inode_unlock          [C release]
> > ----------------------------------------------------------------
> >
> > open()
> > ----------------------------------------------------------------
> > fuse_open_common
> >   fuse_finish_open
> >     invalidate_inode_pages2
> >       lock_page         [B acquire]
> >       fuse_launder_page
> >         fuse_wait_on_page_writeback [A acquire & release]
> >       unlock_page       [B release]
> > ----------------------------------------------------------------
> >
> > Besides this case, all calls of invalidate_inode_pages2() and
> > invalidate_inode_pages2_range() in fuse code also can deadlock with
> > open(O_TRUNC). This commit tries to fix it by adding a new lock,
> > atomic_o_trunc, to protect the areas with the A-B-B-A deadlock risk.
>
> Thanks.  Can you please try the following patch?  Instead of introducing a new
> lock it tries to fix this by moving the truncate_pagecache() out of the nowrite
> protected section.
>
> Thanks,
> Miklos
>
> ---
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 656e921f3506..56f439719129 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -537,6 +537,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
>         struct fuse_file *ff;
>         void *security_ctx = NULL;
>         u32 security_ctxlen;
> +       bool trunc = flags & O_TRUNC;
>
>         /* Userspace expects S_IFREG in create mode */
>         BUG_ON((mode & S_IFMT) != S_IFREG);
> @@ -561,7 +562,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
>         inarg.mode = mode;
>         inarg.umask = current_umask();
>
> -       if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
> +       if (fm->fc->handle_killpriv_v2 && trunc &&
>             !(flags & O_EXCL) && !capable(CAP_FSETID)) {
>                 inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
>         }
> @@ -623,6 +624,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
>         } else {
>                 file->private_data = ff;
>                 fuse_finish_open(inode, file);
> +               if (fm->fc->atomic_o_trunc && trunc)
> +                       truncate_pagecache(inode, 0);
>         }
>         return err;
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 829094451774..2e041708ef44 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -210,7 +210,6 @@ void fuse_finish_open(struct inode *inode, struct file *file)
>                 fi->attr_version = atomic64_inc_return(&fc->attr_version);
>                 i_size_write(inode, 0);
>                 spin_unlock(&fi->lock);
> -               truncate_pagecache(inode, 0);
>                 file_update_time(file);
>                 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
>         } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
> @@ -239,30 +238,32 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
>         if (err)
>                 return err;
>
> -       if (is_wb_truncate || dax_truncate) {
> +       if (is_wb_truncate || dax_truncate)
>                 inode_lock(inode);
> -               fuse_set_nowrite(inode);
> -       }
>
>         if (dax_truncate) {
>                 filemap_invalidate_lock(inode->i_mapping);
>                 err = fuse_dax_break_layouts(inode, 0, 0);
>                 if (err)
> -                       goto out;
> +                       goto out_inode_unlock;
>         }
>
> +       if (is_wb_truncate || dax_truncate)
> +               fuse_set_nowrite(inode);
> +
>         err = fuse_do_open(fm, get_node_id(inode), file, isdir);
>         if (!err)
>                 fuse_finish_open(inode, file);
>
> -out:
> +       if (is_wb_truncate | dax_truncate)
> +               fuse_release_nowrite(inode);
> +       if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
> +               truncate_pagecache(inode, 0);
>         if (dax_truncate)
>                 filemap_invalidate_unlock(inode->i_mapping);
> -
> -       if (is_wb_truncate | dax_truncate) {
> -               fuse_release_nowrite(inode);
> +out_inode_unlock:
> +       if (is_wb_truncate | dax_truncate)
>                 inode_unlock(inode);
> -       }
>
>         return err;
>  }

Hi, Miklos,

I tested this fix, and it did pass the xfstests generic/464 in our
environment. However, if I understand correctly, one of the usages of
the nowrite is to protect file truncation, as said in the commit
message of e4648309b85a78f8c787457832269a8712a8673e. So, does that
mean this fix may introduce some other problems?

Thanks,
Jiachen
Miklos Szeredi March 4, 2022, 3:30 p.m. UTC | #3
On Fri, 4 Mar 2022 at 07:23, Jiachen Zhang
<zhangjiachen.jaycee@bytedance.com> wrote:

> I tested this fix, and it did pass the xfstests generic/464 in our

Thanks for testing!

> environment. However, if I understand correctly, one of the usages of
> the nowrite is to protect file truncation, as said in the commit
> message of e4648309b85a78f8c787457832269a8712a8673e. So, does that
> mean this fix may introduce some other problems?

That's an excellent question.  I don't think this will cause an issue,
since the nowrite protection is for truncation of the file on the
server (userspace) side.   The inode lock still protects concurrent
writes against page cache truncation in the writeback cache case.   In
the non-writeback cache case the nowrite protection does not do
anything.

Thanks,
Miklos
Jiachen Zhang March 7, 2022, 7:55 a.m. UTC | #4
On Fri, Mar 4, 2022 at 11:30 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Fri, 4 Mar 2022 at 07:23, Jiachen Zhang
> <zhangjiachen.jaycee@bytedance.com> wrote:
>
> > I tested this fix, and it did pass the xfstests generic/464 in our
>
> Thanks for testing!
>
> > environment. However, if I understand correctly, one of the usages of
> > the nowrite is to protect file truncation, as said in the commit
> > message of e4648309b85a78f8c787457832269a8712a8673e. So, does that
> > mean this fix may introduce some other problems?
>
> That's an excellent question.  I don't think this will cause an issue,
> since the nowrite protection is for truncation of the file on the
> server (userspace) side.   The inode lock still protects concurrent
> writes against page cache truncation in the writeback cache case.   In
> the non-writeback cache case the nowrite protection does not do
> anything.

Got it. So the nowrite is protecting O_TRUNC FUSE_OPEN (or truncating
FUSE_SETATTR) against FUSE_WRITE in writeback_cache mode? Then this
patch looks good to me.

Thanks,
Jiachen

>
> Thanks,
> Miklos
diff mbox series

Patch

diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 182b24a14804..e5203d61698c 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -878,11 +878,11 @@  static int dmap_writeback_invalidate(struct inode *inode,
 		return ret;
 	}
 
-	ret = invalidate_inode_pages2_range(inode->i_mapping,
+	ret = fuse_invalidate_inode_pages_range(inode,
 					    start_pos >> PAGE_SHIFT,
 					    end_pos >> PAGE_SHIFT);
 	if (ret)
-		pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
+		pr_debug("fuse: fuse_invalidate_inode_pages_range() failed err=%d\n",
 			 ret);
 
 	return ret;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 656e921f3506..d6d5dcd3cf1e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1778,7 +1778,7 @@  int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	if ((is_truncate || !is_wb) &&
 	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
 		truncate_pagecache(inode, outarg.attr.size);
-		invalidate_inode_pages2(mapping);
+		fuse_invalidate_inode_pages(inode);
 	}
 
 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 829094451774..1dde21bad53c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -124,6 +124,28 @@  static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 	}
 }
 
+int fuse_invalidate_inode_pages_range(struct inode *inode, pgoff_t start,
+					pgoff_t end)
+{
+	int ret;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	bool may_truncate = fc->atomic_o_trunc &&
+			    (fc->writeback_cache || FUSE_IS_DAX(inode));
+
+	if (may_truncate)
+		mutex_lock(&get_fuse_inode(inode)->atomic_trunc_mutex);
+	ret = invalidate_inode_pages2_range(inode->i_mapping, start, end);
+	if (may_truncate)
+		mutex_unlock(&get_fuse_inode(inode)->atomic_trunc_mutex);
+
+	return ret;
+}
+
+int fuse_invalidate_inode_pages(struct inode *inode)
+{
+	return fuse_invalidate_inode_pages_range(inode, 0, -1);
+}
+
 struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 				 unsigned int open_flags, bool isdir)
 {
@@ -214,7 +236,7 @@  void fuse_finish_open(struct inode *inode, struct file *file)
 		file_update_time(file);
 		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
 	} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
-		invalidate_inode_pages2(inode->i_mapping);
+		fuse_invalidate_inode_pages(inode);
 	}
 
 	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
@@ -241,6 +263,7 @@  int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 
 	if (is_wb_truncate || dax_truncate) {
 		inode_lock(inode);
+		mutex_lock(&get_fuse_inode(inode)->atomic_trunc_mutex);
 		fuse_set_nowrite(inode);
 	}
 
@@ -261,6 +284,7 @@  int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 
 	if (is_wb_truncate | dax_truncate) {
 		fuse_release_nowrite(inode);
+		mutex_unlock(&get_fuse_inode(inode)->atomic_trunc_mutex);
 		inode_unlock(inode);
 	}
 
@@ -2408,7 +2432,7 @@  static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 		if (vma->vm_flags & VM_MAYSHARE)
 			return -ENODEV;
 
-		invalidate_inode_pages2(file->f_mapping);
+		fuse_invalidate_inode_pages(file_inode(file));
 
 		return generic_file_mmap(file, vma);
 	}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e8e59fbdefeb..ea293d0347a0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -149,6 +149,9 @@  struct fuse_inode {
 	/** Lock to protect write related fields */
 	spinlock_t lock;
 
+	/** Lock for serializing page invalidation and atomic_o_trunc open */
+	struct mutex atomic_trunc_mutex;
+
 #ifdef CONFIG_FUSE_DAX
 	/*
 	 * Dax specific inode data
@@ -1315,4 +1318,8 @@  struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 		       unsigned int open_flags, fl_owner_t id, bool isdir);
 
+int fuse_invalidate_inode_pages(struct inode *inode);
+int fuse_invalidate_inode_pages_range(struct inode *inode,
+				      pgoff_t start, pgoff_t end);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ee846ce371d8..997c620f25df 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -86,6 +86,7 @@  static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->state = 0;
 	mutex_init(&fi->mutex);
 	spin_lock_init(&fi->lock);
+	mutex_init(&fi->atomic_trunc_mutex);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget)
 		goto out_free;
@@ -107,6 +108,7 @@  static void fuse_free_inode(struct inode *inode)
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	mutex_destroy(&fi->mutex);
+	mutex_destroy(&fi->atomic_trunc_mutex);
 	kfree(fi->forget);
 #ifdef CONFIG_FUSE_DAX
 	kfree(fi->dax);
@@ -299,7 +301,7 @@  void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 		}
 
 		if (inval)
-			invalidate_inode_pages2(inode->i_mapping);
+			fuse_invalidate_inode_pages(inode);
 	}
 
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
@@ -448,8 +450,7 @@  int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
 			pg_end = -1;
 		else
 			pg_end = (offset + len - 1) >> PAGE_SHIFT;
-		invalidate_inode_pages2_range(inode->i_mapping,
-					      pg_start, pg_end);
+		fuse_invalidate_inode_pages_range(inode, pg_start, pg_end);
 	}
 	iput(inode);
 	return 0;