[03/11] vfs: no fallback for ->copy_file_range
diff mbox series

Message ID 20181203083416.28978-4-david@fromorbit.com
State New
Headers show
Series
  • fs: fixes for major copy_file_range() issues
Related show

Commit Message

Dave Chinner Dec. 3, 2018, 8:34 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Now that we have generic_copy_file_range(), remove it as a fallback
case when offloads fail. This puts the responsibility for executing
fallbacks on the filesystems that implement ->copy_file_range and
allows us to add operational validity checks to
generic_copy_file_range().

Rework vfs_copy_file_range() to call a new do_copy_file_range()
helper to exceute the copying callout, and move calls to
generic_file_copy_range() into filesystem methods where they
currently return failures.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/ceph/file.c      | 17 ++++++++++++++++-
 fs/cifs/cifsfs.c    |  4 ++++
 fs/fuse/file.c      | 17 ++++++++++++++++-
 fs/nfs/nfs4file.c   |  4 ++++
 fs/overlayfs/file.c |  9 ++++++++-
 fs/read_write.c     | 24 +++++++++++++++---------
 6 files changed, 63 insertions(+), 12 deletions(-)

Comments

Amir Goldstein Dec. 3, 2018, 10:22 a.m. UTC | #1
On Mon, Dec 3, 2018 at 10:34 AM Dave Chinner <david@fromorbit.com> wrote:
>
> From: Dave Chinner <dchinner@redhat.com>
>
> Now that we have generic_copy_file_range(), remove it as a fallback
> case when offloads fail. This puts the responsibility for executing
> fallbacks on the filesystems that implement ->copy_file_range and
> allows us to add operational validity checks to
> generic_copy_file_range().
>
> Rework vfs_copy_file_range() to call a new do_copy_file_range()
> helper to exceute the copying callout, and move calls to
> generic_file_copy_range() into filesystem methods where they
> currently return failures.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

You may add
Reviewed-by: Amir Goldstein <amir73il@gmail.com>

After fixing the overlayfs issue below.
...

> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 84dd957efa24..68736e5d6a56 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -486,8 +486,15 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
>                                    struct file *file_out, loff_t pos_out,
>                                    size_t len, unsigned int flags)
>  {
> -       return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> +       ssize_t ret;
> +
> +       ret =  ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
>                             OVL_COPY);
> +
> +       if (ret == -EOPNOTSUPP)
> +               ret = generic_copy_file_range(file_in, pos_in, file_out,
> +                                       pos_out, len, flags);
> +       return ret;
>  }
>

This is unneeded, because ovl_copyfile(OVL_COPY) is implemented
by calling vfs_copy_file_range() (on the underlying files) and it is
not possible
to get EOPNOTSUPP from vfs_copy_file_range().

Thanks,
Amir.
Anna Schumaker Dec. 3, 2018, 6:23 p.m. UTC | #2
Hi Dave,

On Mon, 2018-12-03 at 19:34 +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that we have generic_copy_file_range(), remove it as a fallback
> case when offloads fail. This puts the responsibility for executing
> fallbacks on the filesystems that implement ->copy_file_range and
> allows us to add operational validity checks to
> generic_copy_file_range().
> 
> Rework vfs_copy_file_range() to call a new do_copy_file_range()
> helper to exceute the copying callout, and move calls to
> generic_file_copy_range() into filesystem methods where they
> currently return failures.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/ceph/file.c      | 17 ++++++++++++++++-
>  fs/cifs/cifsfs.c    |  4 ++++
>  fs/fuse/file.c      | 17 ++++++++++++++++-
>  fs/nfs/nfs4file.c   |  4 ++++

The NFS bits look okay to me:
Acked-by: Anna Schumaker <Anna.Schumaker@Netapp.com

>  fs/overlayfs/file.c |  9 ++++++++-
>  fs/read_write.c     | 24 +++++++++++++++---------
>  6 files changed, 63 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 189df668b6a0..cf29f0410dcb 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1885,7 +1885,7 @@ static int is_file_size_ok(struct inode *src_inode,
> struct inode *dst_inode,
>  	return 0;
>  }
>  
> -static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
> +static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  				    struct file *dst_file, loff_t dst_off,
>  				    size_t len, unsigned int flags)
>  {
> @@ -2096,6 +2096,21 @@ static ssize_t ceph_copy_file_range(struct file
> *src_file, loff_t src_off,
>  	return ret;
>  }
>  
> +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
> +				    struct file *dst_file, loff_t dst_off,
> +				    size_t len, unsigned int flags)
> +{
> +	ssize_t ret;
> +
> +	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
> +					len, flags);
> +
> +	if (ret == -EOPNOTSUPP)
> +		ret = generic_copy_file_range(src_file, src_off, dst_file,
> +					dst_off, len, flags);
> +	return ret;
> +}
> +
>  const struct file_operations ceph_file_fops = {
>  	.open = ceph_open,
>  	.release = ceph_release,
> diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
> index 865706edb307..5ef4baec6234 100644
> --- a/fs/cifs/cifsfs.c
> +++ b/fs/cifs/cifsfs.c
> @@ -1141,6 +1141,10 @@ static ssize_t cifs_copy_file_range(struct file
> *src_file, loff_t off,
>  	rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
>  					len, flags);
>  	free_xid(xid);
> +
> +	if (rc == -EOPNOTSUPP)
> +		rc = generic_copy_file_range(src_file, off, dst_file,
> +					destoff, len, flags);
>  	return rc;
>  }
>  
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index b52f9baaa3e7..b86fb0298739 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -3024,7 +3024,7 @@ static long fuse_file_fallocate(struct file *file, int
> mode, loff_t offset,
>  	return err;
>  }
>  
> -static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
> +static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
>  				    struct file *file_out, loff_t pos_out,
>  				    size_t len, unsigned int flags)
>  {
> @@ -3100,6 +3100,21 @@ static ssize_t fuse_copy_file_range(struct file
> *file_in, loff_t pos_in,
>  	return err;
>  }
>  
> +static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
> +				    struct file *dst_file, loff_t dst_off,
> +				    size_t len, unsigned int flags)
> +{
> +	ssize_t ret;
> +
> +	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
> +					len, flags);
> +
> +	if (ret == -EOPNOTSUPP)
> +		ret = generic_copy_file_range(src_file, src_off, dst_file,
> +					dst_off, len, flags);
> +	return ret;
> +}
> +
>  static const struct file_operations fuse_file_operations = {
>  	.llseek		= fuse_file_llseek,
>  	.read_iter	= fuse_file_read_iter,
> diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
> index 46d691ba04bc..d7766a6eb0f4 100644
> --- a/fs/nfs/nfs4file.c
> +++ b/fs/nfs/nfs4file.c
> @@ -141,6 +141,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in,
> loff_t pos_in,
>  	ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
>  	if (ret == -EAGAIN)
>  		goto retry;
> +
> +	if (ret == -EOPNOTSUPP)
> +		ret = generic_copy_file_range(file_in, pos_in, file_out,
> +					pos_out, count, flags);
>  	return ret;
>  }
>  
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 84dd957efa24..68736e5d6a56 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -486,8 +486,15 @@ static ssize_t ovl_copy_file_range(struct file *file_in,
> loff_t pos_in,
>  				   struct file *file_out, loff_t pos_out,
>  				   size_t len, unsigned int flags)
>  {
> -	return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> +	ssize_t ret;
> +
> +	ret =  ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
>  			    OVL_COPY);
> +
> +	if (ret == -EOPNOTSUPP)
> +		ret = generic_copy_file_range(file_in, pos_in, file_out,
> +					pos_out, len, flags);
> +	return ret;
>  }
>  
>  static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 50114694c98b..44339b44accc 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1570,6 +1570,18 @@ ssize_t generic_copy_file_range(struct file *file_in,
> loff_t pos_in,
>  }
>  EXPORT_SYMBOL(generic_copy_file_range);
>  
> +static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> +			    struct file *file_out, loff_t pos_out,
> +			    size_t len, unsigned int flags)
> +{
> +	if (file_out->f_op->copy_file_range)
> +		return file_out->f_op->copy_file_range(file_in, pos_in,
> file_out,
> +						      pos_out, len, flags);
> +
> +	return generic_copy_file_range(file_in, &pos_in, file_out, &pos_out,
> +					len, flags);
> +}
> +
>  /*
>   * copy_file_range() differs from regular file read and write in that it
>   * specifically allows return partial success.  When it does so is up to
> @@ -1634,15 +1646,9 @@ ssize_t vfs_copy_file_range(struct file *file_in,
> loff_t pos_in,
>  		}
>  	}
>  
> -	if (file_out->f_op->copy_file_range) {
> -		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
> -						      pos_out, len, flags);
> -		if (ret != -EOPNOTSUPP)
> -			goto done;
> -	}
> -
> -	ret = generic_copy_file_range(file_in, &pos_in, file_out, &pos_out,
> -					len, flags);
> +	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> +				flags);
> +	WARN_ON_ONCE(ret == -EOPNOTSUPP);
>  done:
>  	if (ret > 0) {
>  		fsnotify_access(file_in);
Dave Chinner Dec. 3, 2018, 11:02 p.m. UTC | #3
On Mon, Dec 03, 2018 at 12:22:21PM +0200, Amir Goldstein wrote:
> On Mon, Dec 3, 2018 at 10:34 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > Now that we have generic_copy_file_range(), remove it as a fallback
> > case when offloads fail. This puts the responsibility for executing
> > fallbacks on the filesystems that implement ->copy_file_range and
> > allows us to add operational validity checks to
> > generic_copy_file_range().
> >
> > Rework vfs_copy_file_range() to call a new do_copy_file_range()
> > helper to exceute the copying callout, and move calls to
> > generic_file_copy_range() into filesystem methods where they
> > currently return failures.
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> 
> You may add
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> 
> After fixing the overlayfs issue below.
> ...
> 
> > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > index 84dd957efa24..68736e5d6a56 100644
> > --- a/fs/overlayfs/file.c
> > +++ b/fs/overlayfs/file.c
> > @@ -486,8 +486,15 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
> >                                    struct file *file_out, loff_t pos_out,
> >                                    size_t len, unsigned int flags)
> >  {
> > -       return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > +       ssize_t ret;
> > +
> > +       ret =  ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> >                             OVL_COPY);
> > +
> > +       if (ret == -EOPNOTSUPP)
> > +               ret = generic_copy_file_range(file_in, pos_in, file_out,
> > +                                       pos_out, len, flags);
> > +       return ret;
> >  }
> >
> 
> This is unneeded, because ovl_copyfile(OVL_COPY) is implemented
> by calling vfs_copy_file_range() (on the underlying files) and it is
> not possible
> to get EOPNOTSUPP from vfs_copy_file_range().

Except that it is possible. e.g. If the underlying filesystem tries
a copy offload, gets a "not supported" failure from the remote
server and then doesn't implement a fallback.

Cheers,

Dave.
Christoph Hellwig Dec. 4, 2018, 3:16 p.m. UTC | #4
Looks fine,

Reviewed-by: Christoph Hellwig <hch@lst.de>
Amir Goldstein Dec. 6, 2018, 4:16 a.m. UTC | #5
On Tue, Dec 4, 2018 at 1:02 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Mon, Dec 03, 2018 at 12:22:21PM +0200, Amir Goldstein wrote:
> > On Mon, Dec 3, 2018 at 10:34 AM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > From: Dave Chinner <dchinner@redhat.com>
> > >
> > > Now that we have generic_copy_file_range(), remove it as a fallback
> > > case when offloads fail. This puts the responsibility for executing
> > > fallbacks on the filesystems that implement ->copy_file_range and
> > > allows us to add operational validity checks to
> > > generic_copy_file_range().
> > >
> > > Rework vfs_copy_file_range() to call a new do_copy_file_range()
> > > helper to exceute the copying callout, and move calls to
> > > generic_file_copy_range() into filesystem methods where they
> > > currently return failures.
> > >
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> >
> > You may add
> > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> >
> > After fixing the overlayfs issue below.
> > ...
> >
> > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > > index 84dd957efa24..68736e5d6a56 100644
> > > --- a/fs/overlayfs/file.c
> > > +++ b/fs/overlayfs/file.c
> > > @@ -486,8 +486,15 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
> > >                                    struct file *file_out, loff_t pos_out,
> > >                                    size_t len, unsigned int flags)
> > >  {
> > > -       return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > > +       ssize_t ret;
> > > +
> > > +       ret =  ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > >                             OVL_COPY);
> > > +
> > > +       if (ret == -EOPNOTSUPP)
> > > +               ret = generic_copy_file_range(file_in, pos_in, file_out,
> > > +                                       pos_out, len, flags);
> > > +       return ret;
> > >  }
> > >
> >
> > This is unneeded, because ovl_copyfile(OVL_COPY) is implemented
> > by calling vfs_copy_file_range() (on the underlying files) and it is
> > not possible
> > to get EOPNOTSUPP from vfs_copy_file_range().
>
> Except that it is possible. e.g. If the underlying filesystem tries
> a copy offload, gets a "not supported" failure from the remote
> server and then doesn't implement a fallback.
>

I'm in the opinion that ovl_copy_file_range() and do_copy_file_range()
are a like. If you choose not to fallback in the latter to
generic_copy_file_range() for misbehaving filesystem and WARN_ON
this case, there is no reason for overlayfs to cover up for the
misbehaving underlying filesystem.

If you want to cover up for misbehaving filesystem, please do it
in do_copy_file_range() and drop the WARN_ON_ONCE().
Come to think about it, I understand your reasoning for pushing
generic_copy_file_range() down to filesystems so they can fallback to
it in several error conditions.
I do not follow the reasoning of NOT falling back to
generic_copy_file_range() in vfs if EOPNOTSUPP is returned from
filesystem. IOW, if we want to cover up for misbehaving filesystem,
this would have been a more robust code:

+static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
+                           struct file *file_out, loff_t pos_out,
+                           size_t len, unsigned int flags)
+{
+       ssize_t ret;
+
+       if (file_out->f_op->copy_file_range) {
+               ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+                                                     pos_out, len, flags);
+               if (!WARN_ON_ONCE(ret == -EOPNOTSUPP))
+                       return ret;
+       }
+       return generic_copy_file_range(file_in, &pos_in, file_out, &pos_out,
+                                       len, flags);
+}
+

Thanks,
Amir.
Dave Chinner Dec. 6, 2018, 9:30 p.m. UTC | #6
On Thu, Dec 06, 2018 at 06:16:46AM +0200, Amir Goldstein wrote:
> On Tue, Dec 4, 2018 at 1:02 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Mon, Dec 03, 2018 at 12:22:21PM +0200, Amir Goldstein wrote:
> > > On Mon, Dec 3, 2018 at 10:34 AM Dave Chinner <david@fromorbit.com> wrote:
> > > >
> > > > From: Dave Chinner <dchinner@redhat.com>
> > > >
> > > > Now that we have generic_copy_file_range(), remove it as a fallback
> > > > case when offloads fail. This puts the responsibility for executing
> > > > fallbacks on the filesystems that implement ->copy_file_range and
> > > > allows us to add operational validity checks to
> > > > generic_copy_file_range().
> > > >
> > > > Rework vfs_copy_file_range() to call a new do_copy_file_range()
> > > > helper to exceute the copying callout, and move calls to
> > > > generic_file_copy_range() into filesystem methods where they
> > > > currently return failures.
> > > >
> > > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > >
> > > You may add
> > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> > >
> > > After fixing the overlayfs issue below.
> > > ...
> > >
> > > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > > > index 84dd957efa24..68736e5d6a56 100644
> > > > --- a/fs/overlayfs/file.c
> > > > +++ b/fs/overlayfs/file.c
> > > > @@ -486,8 +486,15 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
> > > >                                    struct file *file_out, loff_t pos_out,
> > > >                                    size_t len, unsigned int flags)
> > > >  {
> > > > -       return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > > > +       ssize_t ret;
> > > > +
> > > > +       ret =  ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > > >                             OVL_COPY);
> > > > +
> > > > +       if (ret == -EOPNOTSUPP)
> > > > +               ret = generic_copy_file_range(file_in, pos_in, file_out,
> > > > +                                       pos_out, len, flags);
> > > > +       return ret;
> > > >  }
> > > >
> > >
> > > This is unneeded, because ovl_copyfile(OVL_COPY) is implemented
> > > by calling vfs_copy_file_range() (on the underlying files) and it is
> > > not possible
> > > to get EOPNOTSUPP from vfs_copy_file_range().
> >
> > Except that it is possible. e.g. If the underlying filesystem tries
> > a copy offload, gets a "not supported" failure from the remote
> > server and then doesn't implement a fallback.
> >
> 
> I'm in the opinion that ovl_copy_file_range() and do_copy_file_range()
> are a like. If you choose not to fallback in the latter to
> generic_copy_file_range() for misbehaving filesystem and WARN_ON
> this case, there is no reason for overlayfs to cover up for the
> misbehaving underlying filesystem.
> 
> If you want to cover up for misbehaving filesystem, please do it
> in do_copy_file_range() and drop the WARN_ON_ONCE().
> Come to think about it, I understand your reasoning for pushing
> generic_copy_file_range() down to filesystems so they can fallback to
> it in several error conditions.
> I do not follow the reasoning of NOT falling back to
> generic_copy_file_range() in vfs if EOPNOTSUPP is returned from
> filesystem. IOW, if we want to cover up for misbehaving filesystem,
> this would have been a more robust code:

Since when have we defined a filesystem returning -EOPNOTSUPP as a
"misbehaving filesystem"? Userspace has to handle errors in
copy_file_range() with it's own fallback copy code (i.e. it cannot
rely on the kernel actually supporting copy_file_range at all).
Hence it's perfectly fine for a filesystem implementation to encode
"offload or fail entirely" semantics if they want.

Yes, I've been shouted at by developers quite recently who
*demanded* that copy_file_range (and other offloads like
fallocate(ZERO_RANGE)) *fail* if they cannot "offload" the operation
to make it "fast". The application developers want to use different
algorithms if the kernel offload isn't any faster than userspace
doing the dumb thing and phsyically pushing bytes around itself.

I've pushed back on this as much as I can, but it doesn't change the
fact that for many situations doing do_splice_direct() is exactly
the wrong thing to do (e.g. because copy_file_range() on a TB+ scale
file couldn't be offloaded by the filesystem because the server said
EOPNOTSUPP)

IOWs, for some filesystems or situations where it makes sense to
have fail-fast semantics and leave the decision of what to do next
in the hands of the userspace application that has the context
necessary to determine what the best action to take is.  And to do
that, we need to give control of the fallback to the filesystems.

Flexibility is what is needed here, not a dumb, hard coded "the VFS
always know what's right for you" policy that triggers when nobody
really wants it to.

Cheers,

Dave.
Amir Goldstein Dec. 7, 2018, 5:38 a.m. UTC | #7
On Thu, Dec 6, 2018 at 11:31 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Thu, Dec 06, 2018 at 06:16:46AM +0200, Amir Goldstein wrote:
> > On Tue, Dec 4, 2018 at 1:02 AM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Mon, Dec 03, 2018 at 12:22:21PM +0200, Amir Goldstein wrote:
> > > > On Mon, Dec 3, 2018 at 10:34 AM Dave Chinner <david@fromorbit.com> wrote:
> > > > >
> > > > > From: Dave Chinner <dchinner@redhat.com>
> > > > >
> > > > > Now that we have generic_copy_file_range(), remove it as a fallback
> > > > > case when offloads fail. This puts the responsibility for executing
> > > > > fallbacks on the filesystems that implement ->copy_file_range and
> > > > > allows us to add operational validity checks to
> > > > > generic_copy_file_range().
> > > > >
> > > > > Rework vfs_copy_file_range() to call a new do_copy_file_range()
> > > > > helper to exceute the copying callout, and move calls to
> > > > > generic_file_copy_range() into filesystem methods where they
> > > > > currently return failures.
> > > > >
> > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > >
> > > > You may add
> > > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> > > >
> > > > After fixing the overlayfs issue below.
> > > > ...
> > > >
> > > > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > > > > index 84dd957efa24..68736e5d6a56 100644
> > > > > --- a/fs/overlayfs/file.c
> > > > > +++ b/fs/overlayfs/file.c
> > > > > @@ -486,8 +486,15 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > >                                    struct file *file_out, loff_t pos_out,
> > > > >                                    size_t len, unsigned int flags)
> > > > >  {
> > > > > -       return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > > > > +       ssize_t ret;
> > > > > +
> > > > > +       ret =  ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > > > >                             OVL_COPY);
> > > > > +
> > > > > +       if (ret == -EOPNOTSUPP)
> > > > > +               ret = generic_copy_file_range(file_in, pos_in, file_out,
> > > > > +                                       pos_out, len, flags);
> > > > > +       return ret;
> > > > >  }
> > > > >
> > > >
> > > > This is unneeded, because ovl_copyfile(OVL_COPY) is implemented
> > > > by calling vfs_copy_file_range() (on the underlying files) and it is
> > > > not possible
> > > > to get EOPNOTSUPP from vfs_copy_file_range().
> > >
> > > Except that it is possible. e.g. If the underlying filesystem tries
> > > a copy offload, gets a "not supported" failure from the remote
> > > server and then doesn't implement a fallback.
> > >
> >
> > I'm in the opinion that ovl_copy_file_range() and do_copy_file_range()
> > are a like. If you choose not to fallback in the latter to
> > generic_copy_file_range() for misbehaving filesystem and WARN_ON
> > this case, there is no reason for overlayfs to cover up for the
> > misbehaving underlying filesystem.
> >
> > If you want to cover up for misbehaving filesystem, please do it
> > in do_copy_file_range() and drop the WARN_ON_ONCE().
> > Come to think about it, I understand your reasoning for pushing
> > generic_copy_file_range() down to filesystems so they can fallback to
> > it in several error conditions.
> > I do not follow the reasoning of NOT falling back to
> > generic_copy_file_range() in vfs if EOPNOTSUPP is returned from
> > filesystem. IOW, if we want to cover up for misbehaving filesystem,
> > this would have been a more robust code:
>
> Since when have we defined a filesystem returning -EOPNOTSUPP as a
> "misbehaving filesystem"?

Since you wrote:

WARN_ON_ONCE(ret == -EOPNOTSUPP);

If filesystem is allowed to return EOPNOTSUPP from ->copy_file_range()
then what is this warning about?

> Userspace has to handle errors in
> copy_file_range() with it's own fallback copy code (i.e. it cannot
> rely on the kernel actually supporting copy_file_range at all).
> Hence it's perfectly fine for a filesystem implementation to encode
> "offload or fail entirely" semantics if they want.
>
> Yes, I've been shouted at by developers quite recently who
> *demanded* that copy_file_range (and other offloads like
> fallocate(ZERO_RANGE)) *fail* if they cannot "offload" the operation
> to make it "fast". The application developers want to use different
> algorithms if the kernel offload isn't any faster than userspace
> doing the dumb thing and phsyically pushing bytes around itself.
>
> I've pushed back on this as much as I can, but it doesn't change the
> fact that for many situations doing do_splice_direct() is exactly
> the wrong thing to do (e.g. because copy_file_range() on a TB+ scale
> file couldn't be offloaded by the filesystem because the server said
> EOPNOTSUPP)
>
> IOWs, for some filesystems or situations where it makes sense to
> have fail-fast semantics and leave the decision of what to do next
> in the hands of the userspace application that has the context
> necessary to determine what the best action to take is.  And to do
> that, we need to give control of the fallback to the filesystems.
>
> Flexibility is what is needed here, not a dumb, hard coded "the VFS
> always know what's right for you" policy that triggers when nobody
> really wants it to.
>

You misunderstood me.
Please remove the fallback to generic_copy_file_range() in
ovl_copy_file_range() as I requested in initial review for the exact
same reasons that you list above.

The overlayfs implementation of ovl_copy_file_range() is just
handing over the call to underlying vfs_copy_file_range().
If the latter is expected to return EOPNOTSUPP, so does the
overlayfs implementation.

Thanks,
Amir.

Patch
diff mbox series

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 189df668b6a0..cf29f0410dcb 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1885,7 +1885,7 @@  static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
 	return 0;
 }
 
-static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 				    struct file *dst_file, loff_t dst_off,
 				    size_t len, unsigned int flags)
 {
@@ -2096,6 +2096,21 @@  static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
 	return ret;
 }
 
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+				    struct file *dst_file, loff_t dst_off,
+				    size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
+					len, flags);
+
+	if (ret == -EOPNOTSUPP)
+		ret = generic_copy_file_range(src_file, src_off, dst_file,
+					dst_off, len, flags);
+	return ret;
+}
+
 const struct file_operations ceph_file_fops = {
 	.open = ceph_open,
 	.release = ceph_release,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 865706edb307..5ef4baec6234 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1141,6 +1141,10 @@  static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
 	rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
 					len, flags);
 	free_xid(xid);
+
+	if (rc == -EOPNOTSUPP)
+		rc = generic_copy_file_range(src_file, off, dst_file,
+					destoff, len, flags);
 	return rc;
 }
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b52f9baaa3e7..b86fb0298739 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3024,7 +3024,7 @@  static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	return err;
 }
 
-static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    size_t len, unsigned int flags)
 {
@@ -3100,6 +3100,21 @@  static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 	return err;
 }
 
+static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
+				    struct file *dst_file, loff_t dst_off,
+				    size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
+					len, flags);
+
+	if (ret == -EOPNOTSUPP)
+		ret = generic_copy_file_range(src_file, src_off, dst_file,
+					dst_off, len, flags);
+	return ret;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read_iter	= fuse_file_read_iter,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 46d691ba04bc..d7766a6eb0f4 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -141,6 +141,10 @@  static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 	ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
 	if (ret == -EAGAIN)
 		goto retry;
+
+	if (ret == -EOPNOTSUPP)
+		ret = generic_copy_file_range(file_in, pos_in, file_out,
+					pos_out, count, flags);
 	return ret;
 }
 
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 84dd957efa24..68736e5d6a56 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -486,8 +486,15 @@  static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
 				   struct file *file_out, loff_t pos_out,
 				   size_t len, unsigned int flags)
 {
-	return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
+	ssize_t ret;
+
+	ret =  ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
 			    OVL_COPY);
+
+	if (ret == -EOPNOTSUPP)
+		ret = generic_copy_file_range(file_in, pos_in, file_out,
+					pos_out, len, flags);
+	return ret;
 }
 
 static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
diff --git a/fs/read_write.c b/fs/read_write.c
index 50114694c98b..44339b44accc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1570,6 +1570,18 @@  ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
+static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
+			    struct file *file_out, loff_t pos_out,
+			    size_t len, unsigned int flags)
+{
+	if (file_out->f_op->copy_file_range)
+		return file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+						      pos_out, len, flags);
+
+	return generic_copy_file_range(file_in, &pos_in, file_out, &pos_out,
+					len, flags);
+}
+
 /*
  * copy_file_range() differs from regular file read and write in that it
  * specifically allows return partial success.  When it does so is up to
@@ -1634,15 +1646,9 @@  ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 		}
 	}
 
-	if (file_out->f_op->copy_file_range) {
-		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
-						      pos_out, len, flags);
-		if (ret != -EOPNOTSUPP)
-			goto done;
-	}
-
-	ret = generic_copy_file_range(file_in, &pos_in, file_out, &pos_out,
-					len, flags);
+	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
+				flags);
+	WARN_ON_ONCE(ret == -EOPNOTSUPP);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);