diff mbox series

[v2,3/3] fs: use do_splice_direct() for nfsd/ksmbd server-side-copy

Message ID 20231130141624.3338942-4-amir73il@gmail.com (mailing list archive)
State New
Headers show
Series Avert possible deadlock with splice() and fanotify | expand

Commit Message

Amir Goldstein Nov. 30, 2023, 2:16 p.m. UTC
nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to
perform kernel copy between two files on any two filesystems.

Splicing input file, while holding file_start_write() on the output file
which is on a different sb, poses a risk for fanotify related deadlocks.

We only need to call splice_file_range() from within the context of
->copy_file_range() filesystem methods with file_start_write() held.

To avoid the possible deadlocks, always use do_splice_direct() instead of
splice_file_range() for the kernel copy fallback in vfs_copy_file_range()
without holding file_start_write().

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/read_write.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

Comments

Jan Kara Nov. 30, 2023, 4:49 p.m. UTC | #1
On Thu 30-11-23 16:16:24, Amir Goldstein wrote:
> nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to
> perform kernel copy between two files on any two filesystems.
> 
> Splicing input file, while holding file_start_write() on the output file
> which is on a different sb, poses a risk for fanotify related deadlocks.
> 
> We only need to call splice_file_range() from within the context of
> ->copy_file_range() filesystem methods with file_start_write() held.
> 
> To avoid the possible deadlocks, always use do_splice_direct() instead of
> splice_file_range() for the kernel copy fallback in vfs_copy_file_range()
> without holding file_start_write().
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

Looks good to me. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/read_write.c | 36 +++++++++++++++++++++++-------------
>  1 file changed, 23 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 0bc99f38e623..e0c2c1b5962b 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1421,6 +1421,10 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>  				struct file *file_out, loff_t pos_out,
>  				size_t len, unsigned int flags)
>  {
> +	/* May only be called from within ->copy_file_range() methods */
> +	if (WARN_ON_ONCE(flags))
> +		return -EINVAL;
> +
>  	return splice_file_range(file_in, &pos_in, file_out, &pos_out,
>  				 min_t(size_t, len, MAX_RW_COUNT));
>  }
> @@ -1541,19 +1545,22 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  		ret = file_out->f_op->copy_file_range(file_in, pos_in,
>  						      file_out, pos_out,
>  						      len, flags);
> -		goto done;
> -	}
> -
> -	if (!splice && file_in->f_op->remap_file_range &&
> -	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
> +	} else if (!splice && file_in->f_op->remap_file_range &&
> +		   file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
>  		ret = file_in->f_op->remap_file_range(file_in, pos_in,
>  				file_out, pos_out,
>  				min_t(loff_t, MAX_RW_COUNT, len),
>  				REMAP_FILE_CAN_SHORTEN);
> -		if (ret > 0)
> -			goto done;
> +		/* fallback to splice */
> +		if (ret <= 0)
> +			splice = true;
>  	}
>  
> +	file_end_write(file_out);
> +
> +	if (!splice)
> +		goto done;
> +
>  	/*
>  	 * We can get here for same sb copy of filesystems that do not implement
>  	 * ->copy_file_range() in case filesystem does not support clone or in
> @@ -1565,11 +1572,16 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  	 * and which filesystems do not, that will allow userspace tools to
>  	 * make consistent desicions w.r.t using copy_file_range().
>  	 *
> -	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
> +	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
> +	 * for server-side-copy between any two sb.
> +	 *
> +	 * In any case, we call do_splice_direct() and not splice_file_range(),
> +	 * without file_start_write() held, to avoid possible deadlocks related
> +	 * to splicing from input file, while file_start_write() is held on
> +	 * the output file on a different sb.
>  	 */
> -	ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -				      flags);
> -
> +	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> +			       min_t(size_t, len, MAX_RW_COUNT), 0);
>  done:
>  	if (ret > 0) {
>  		fsnotify_access(file_in);
> @@ -1581,8 +1593,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  	inc_syscr(current);
>  	inc_syscw(current);
>  
> -	file_end_write(file_out);
> -
>  	return ret;
>  }
>  EXPORT_SYMBOL(vfs_copy_file_range);
> -- 
> 2.34.1
>
Christoph Hellwig Dec. 4, 2023, 8:39 a.m. UTC | #2
On Thu, Nov 30, 2023 at 04:16:24PM +0200, Amir Goldstein wrote:
> nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to
> perform kernel copy between two files on any two filesystems.
> 
> Splicing input file, while holding file_start_write() on the output file
> which is on a different sb, poses a risk for fanotify related deadlocks.
> 
> We only need to call splice_file_range() from within the context of
> ->copy_file_range() filesystem methods with file_start_write() held.
> 
> To avoid the possible deadlocks, always use do_splice_direct() instead of
> splice_file_range() for the kernel copy fallback in vfs_copy_file_range()
> without holding file_start_write().

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

(although I wish do_splice_direct had a better name like
vfs_splice_direct, especially before growing more users)
Amir Goldstein Dec. 4, 2023, 1:19 p.m. UTC | #3
On Mon, Dec 4, 2023 at 10:39 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Thu, Nov 30, 2023 at 04:16:24PM +0200, Amir Goldstein wrote:
> > nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to
> > perform kernel copy between two files on any two filesystems.
> >
> > Splicing input file, while holding file_start_write() on the output file
> > which is on a different sb, poses a risk for fanotify related deadlocks.
> >
> > We only need to call splice_file_range() from within the context of
> > ->copy_file_range() filesystem methods with file_start_write() held.
> >
> > To avoid the possible deadlocks, always use do_splice_direct() instead of
> > splice_file_range() for the kernel copy fallback in vfs_copy_file_range()
> > without holding file_start_write().
>
> Looks good:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
>
> (although I wish do_splice_direct had a better name like
> vfs_splice_direct, especially before growing more users)

I tried very hard in this series to add a little bit of consistency
for function names and indication of what it may be responsible for.

After this cleanup series, many of the file permission hooks were
moved from do_XXX() helpers to vfs_XXX() helpers, so I cannot in
good conscience rename do_splice_direct(), which does not have
file permission hooks to vfs_splice_direct().

I can rename it to splice_direct(), like several other splice_XXX()
exported helpers in this file.

ok?

Thanks,
Amir.
Christoph Hellwig Dec. 4, 2023, 2:02 p.m. UTC | #4
On Mon, Dec 04, 2023 at 03:19:26PM +0200, Amir Goldstein wrote:
> I tried very hard in this series to add a little bit of consistency
> for function names and indication of what it may be responsible for.
> 
> After this cleanup series, many of the file permission hooks were
> moved from do_XXX() helpers to vfs_XXX() helpers, so I cannot in
> good conscience rename do_splice_direct(), which does not have
> file permission hooks to vfs_splice_direct().
> 
> I can rename it to splice_direct(), like several other splice_XXX()
> exported helpers in this file.

Let's keep the name for now.  do_ prefixes are not great, especially
for exported functions, but no prefix at all isn't great either.
So let's get your work done and then we can look into introducing
a consistent naming scheme eventually.
diff mbox series

Patch

diff --git a/fs/read_write.c b/fs/read_write.c
index 0bc99f38e623..e0c2c1b5962b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1421,6 +1421,10 @@  ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				size_t len, unsigned int flags)
 {
+	/* May only be called from within ->copy_file_range() methods */
+	if (WARN_ON_ONCE(flags))
+		return -EINVAL;
+
 	return splice_file_range(file_in, &pos_in, file_out, &pos_out,
 				 min_t(size_t, len, MAX_RW_COUNT));
 }
@@ -1541,19 +1545,22 @@  ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 		ret = file_out->f_op->copy_file_range(file_in, pos_in,
 						      file_out, pos_out,
 						      len, flags);
-		goto done;
-	}
-
-	if (!splice && file_in->f_op->remap_file_range &&
-	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+	} else if (!splice && file_in->f_op->remap_file_range &&
+		   file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
 		ret = file_in->f_op->remap_file_range(file_in, pos_in,
 				file_out, pos_out,
 				min_t(loff_t, MAX_RW_COUNT, len),
 				REMAP_FILE_CAN_SHORTEN);
-		if (ret > 0)
-			goto done;
+		/* fallback to splice */
+		if (ret <= 0)
+			splice = true;
 	}
 
+	file_end_write(file_out);
+
+	if (!splice)
+		goto done;
+
 	/*
 	 * We can get here for same sb copy of filesystems that do not implement
 	 * ->copy_file_range() in case filesystem does not support clone or in
@@ -1565,11 +1572,16 @@  ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * and which filesystems do not, that will allow userspace tools to
 	 * make consistent desicions w.r.t using copy_file_range().
 	 *
-	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
+	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
+	 * for server-side-copy between any two sb.
+	 *
+	 * In any case, we call do_splice_direct() and not splice_file_range(),
+	 * without file_start_write() held, to avoid possible deadlocks related
+	 * to splicing from input file, while file_start_write() is held on
+	 * the output file on a different sb.
 	 */
-	ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				      flags);
-
+	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+			       min_t(size_t, len, MAX_RW_COUNT), 0);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);
@@ -1581,8 +1593,6 @@  ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	inc_syscr(current);
 	inc_syscw(current);
 
-	file_end_write(file_out);
-
 	return ret;
 }
 EXPORT_SYMBOL(vfs_copy_file_range);