diff mbox series

[V8,2/3] fuse: Introduce synchronous read and write for passthrough

Message ID 20200911163403.79505-3-balsini@android.com (mailing list archive)
State New, archived
Headers show
Series fuse: Add support for passthrough read/write | expand

Commit Message

Alessio Balsini Sept. 11, 2020, 4:34 p.m. UTC
All the read and write operations performed on fuse_files which have the
passthrough feature enabled are forwarded to the associated lower file
system file.

Sending the request directly to the lower file system avoids the userspace
round-trip that, because of possible context switches and additional
operations, might reduce the overall performance, especially in those cases
where caching doesn't help, for example in reads at random offsets.

Whether a fuse_file has a lower file system file associated for passthrough
can be verified by checking the validity of its passthrough_filp pointer,
which is not null only if passthrough has been successfully enabled via the
appropriate ioctl(). When a read/write operation is requested for a FUSE
file with passthrough enabled, the request is directly forwarded to the
corresponding file_operations of the lower file system file. After the
read/write operation is completed, the file stats change is notified (and
propagated) to the lower file system.

This change only implements synchronous requests in passthrough, returning
an error in the case of asynchronous operations, yet covering the majority
of the use cases.

Signed-off-by: Alessio Balsini <balsini@android.com>
---
 fs/fuse/file.c        |  8 +++--
 fs/fuse/fuse_i.h      |  2 ++
 fs/fuse/passthrough.c | 81 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 2 deletions(-)

Comments

Amir Goldstein Sept. 12, 2020, 9:55 a.m. UTC | #1
On Fri, Sep 11, 2020 at 7:34 PM Alessio Balsini <balsini@android.com> wrote:
>
> All the read and write operations performed on fuse_files which have the
> passthrough feature enabled are forwarded to the associated lower file
> system file.
>
> Sending the request directly to the lower file system avoids the userspace
> round-trip that, because of possible context switches and additional
> operations might reduce the overall performance, especially in those cases
> where caching doesn't help, for example in reads at random offsets.
>
> If a fuse_file has a lower file system file associated for passthrough can
> be verified by checking the validity of its passthrough_filp pointer, which
> is not null only passthrough has been successfully enabled via the
> appropriate ioctl(). When a read/write operation is requested for a FUSE
> file with passthrough enabled, the request is directly forwarded to the
> corresponding file_operations of the lower file system file. After the
> read/write operation is completed, the file stats change is notified (and
> propagated) to the lower file system.
>
> This change only implements synchronous requests in passthrough, returning
> an error in the case of ansynchronous operations, yet covering the majority
> of the use cases.
>
> Signed-off-by: Alessio Balsini <balsini@android.com>
> ---
>  fs/fuse/file.c        |  8 +++--
>  fs/fuse/fuse_i.h      |  2 ++
>  fs/fuse/passthrough.c | 81 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 89 insertions(+), 2 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 6c0ec742ce74..c3289ff0cd33 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1552,7 +1552,9 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>         if (is_bad_inode(file_inode(file)))
>                 return -EIO;
>
> -       if (!(ff->open_flags & FOPEN_DIRECT_IO))
> +       if (ff->passthrough_filp)
> +               return fuse_passthrough_read_iter(iocb, to);
> +       else if (!(ff->open_flags & FOPEN_DIRECT_IO))
>                 return fuse_cache_read_iter(iocb, to);
>         else
>                 return fuse_direct_read_iter(iocb, to);
> @@ -1566,7 +1568,9 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         if (is_bad_inode(file_inode(file)))
>                 return -EIO;
>
> -       if (!(ff->open_flags & FOPEN_DIRECT_IO))
> +       if (ff->passthrough_filp)
> +               return fuse_passthrough_write_iter(iocb, from);
> +       else if (!(ff->open_flags & FOPEN_DIRECT_IO))
>                 return fuse_cache_write_iter(iocb, from);
>         else
>                 return fuse_direct_write_iter(iocb, from);
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 6c5166447905..21ba30a6a661 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -1106,5 +1106,7 @@ void fuse_free_conn(struct fuse_conn *fc);
>
>  int fuse_passthrough_setup(struct fuse_req *req, unsigned int fd);
>  void fuse_passthrough_release(struct fuse_file *ff);
> +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *to);
> +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *from);
>
>  #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
> index 86ab4eafa7bf..44a78e02f45d 100644
> --- a/fs/fuse/passthrough.c
> +++ b/fs/fuse/passthrough.c
> @@ -2,6 +2,87 @@
>
>  #include "fuse_i.h"
>
> +#include <linux/fs_stack.h>
> +#include <linux/fsnotify.h>
> +#include <linux/uio.h>
> +
> +static void fuse_copyattr(struct file *dst_file, struct file *src_file,
> +                         bool write)
> +{
> +       if (write) {
> +               struct inode *dst = file_inode(dst_file);
> +               struct inode *src = file_inode(src_file);
> +
> +               fsnotify_modify(src_file);
> +               fsstack_copy_inode_size(dst, src);
> +       } else {
> +               fsnotify_access(src_file);
> +       }
> +}
> +
> +
> +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb_fuse,
> +                                  struct iov_iter *iter)
> +{
> +       ssize_t ret;
> +       struct file *fuse_filp = iocb_fuse->ki_filp;
> +       struct fuse_file *ff = fuse_filp->private_data;
> +       struct file *passthrough_filp = ff->passthrough_filp;
> +
> +       if (!iov_iter_count(iter))
> +               return 0;
> +
> +       if (is_sync_kiocb(iocb_fuse)) {
> +               struct kiocb iocb;
> +
> +               kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
> +               ret = call_read_iter(passthrough_filp, &iocb, iter);
> +               iocb_fuse->ki_pos = iocb.ki_pos;
> +               if (ret >= 0)
> +                       fuse_copyattr(fuse_filp, passthrough_filp, false);
> +
> +       } else {
> +               ret = -EIO;
> +       }
> +
> +       return ret;
> +}
> +
> +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse,
> +                                   struct iov_iter *iter)
> +{
> +       ssize_t ret;
> +       struct file *fuse_filp = iocb_fuse->ki_filp;
> +       struct fuse_file *ff = fuse_filp->private_data;
> +       struct inode *fuse_inode = file_inode(fuse_filp);
> +       struct file *passthrough_filp = ff->passthrough_filp;
> +
> +       if (!iov_iter_count(iter))
> +               return 0;
> +
> +       inode_lock(fuse_inode);
> +
> +       if (is_sync_kiocb(iocb_fuse)) {
> +               struct kiocb iocb;
> +
> +               kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
> +
> +               file_start_write(passthrough_filp);
> +               ret = call_write_iter(passthrough_filp, &iocb, iter);

Why not vfs_iter_write()/vfs_iter_read()?

You are bypassing many internal VFS checks that seem pretty important.

Thanks,
Amir.
Alessio Balsini Sept. 21, 2020, 11:01 a.m. UTC | #2
Hi Amir,

On Sat, Sep 12, 2020 at 12:55:35PM +0300, Amir Goldstein wrote:
> On Fri, Sep 11, 2020 at 7:34 PM Alessio Balsini <balsini@android.com> wrote:
> > +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb_fuse,
> > +                                  struct iov_iter *iter)
> > +{
> > +       ssize_t ret;
> > +       struct file *fuse_filp = iocb_fuse->ki_filp;
> > +       struct fuse_file *ff = fuse_filp->private_data;
> > +       struct file *passthrough_filp = ff->passthrough_filp;
> > +
> > +       if (!iov_iter_count(iter))
> > +               return 0;
> > +
> > +       if (is_sync_kiocb(iocb_fuse)) {
> > +               struct kiocb iocb;
> > +
> > +               kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
> > +               ret = call_read_iter(passthrough_filp, &iocb, iter);
> > +               iocb_fuse->ki_pos = iocb.ki_pos;
> > +               if (ret >= 0)
> > +                       fuse_copyattr(fuse_filp, passthrough_filp, false);
> > +
> > +       } else {
> > +               ret = -EIO;
> > +       }
> > +
> > +       return ret;
> > +}
> > +
> > +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse,
> > +                                   struct iov_iter *iter)
> > +{
> > +       ssize_t ret;
> > +       struct file *fuse_filp = iocb_fuse->ki_filp;
> > +       struct fuse_file *ff = fuse_filp->private_data;
> > +       struct inode *fuse_inode = file_inode(fuse_filp);
> > +       struct file *passthrough_filp = ff->passthrough_filp;
> > +
> > +       if (!iov_iter_count(iter))
> > +               return 0;
> > +
> > +       inode_lock(fuse_inode);
> > +
> > +       if (is_sync_kiocb(iocb_fuse)) {
> > +               struct kiocb iocb;
> > +
> > +               kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
> > +
> > +               file_start_write(passthrough_filp);
> > +               ret = call_write_iter(passthrough_filp, &iocb, iter);
> 
> Why not vfs_iter_write()/vfs_iter_read()?
> 
> You are bypassing many internal VFS checks that seem pretty important.
> 

I've been thinking a lot about this and decided to go for the VFS bypassing
solution because:
1. it looked odd to me to perform VFS checks twice, both for FUSE and lower
   FS and it seemed to me that we found a tradeoff with Jann about doing
   this lower FS call, and
2. in our Android use case (I just saw you asking for more details about
   this; I'll reply on the other thread), the user might have the right
   credentials to access the FUSE file system, but not to access the lower
   file system, so the VFS checks would fail. That would have created
   the need for a credential bypass that looked hacky.

But I agree and I would probably sleep better knowing that VFS checks are
not skipped :) So I decided to implement the vfs_iter_{read,write}()
variant.

I again picked a lot from the overlayfs solution. In a few words, I get the
FUSE daemon credential reference at FUSE FS creation time and, when
passthrough read/write operations are triggered, the kernel temporarily
overrides the requesting process' credentials with those of the FUSE
daemon. Credentials are reverted as soon as the operation completes.

I have a temporary development branch where I'm developing the V9 of this
patch, plus the VFS variant (git history may change):

https://github.com/balsini/linux/tree/fuse-passthrough-stable-v5.8-v9-vfs

For now, I'm happy to say that I like this VFS solution, it also simplified
the lower file system notifications, so I'll probably go with this in the
V9.

Thanks again Amir, much appreciated.
Alessio
Amir Goldstein Sept. 21, 2020, 1:07 p.m. UTC | #3
On Mon, Sep 21, 2020 at 2:01 PM Alessio Balsini <balsini@android.com> wrote:
>
> Hi Amir,
>
> On Sat, Sep 12, 2020 at 12:55:35PM +0300, Amir Goldstein wrote:
> > On Fri, Sep 11, 2020 at 7:34 PM Alessio Balsini <balsini@android.com> wrote:
> > > +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb_fuse,
> > > +                                  struct iov_iter *iter)
> > > +{
> > > +       ssize_t ret;
> > > +       struct file *fuse_filp = iocb_fuse->ki_filp;
> > > +       struct fuse_file *ff = fuse_filp->private_data;
> > > +       struct file *passthrough_filp = ff->passthrough_filp;
> > > +
> > > +       if (!iov_iter_count(iter))
> > > +               return 0;
> > > +
> > > +       if (is_sync_kiocb(iocb_fuse)) {
> > > +               struct kiocb iocb;
> > > +
> > > +               kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
> > > +               ret = call_read_iter(passthrough_filp, &iocb, iter);
> > > +               iocb_fuse->ki_pos = iocb.ki_pos;
> > > +               if (ret >= 0)
> > > +                       fuse_copyattr(fuse_filp, passthrough_filp, false);
> > > +
> > > +       } else {
> > > +               ret = -EIO;
> > > +       }
> > > +
> > > +       return ret;
> > > +}
> > > +
> > > +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse,
> > > +                                   struct iov_iter *iter)
> > > +{
> > > +       ssize_t ret;
> > > +       struct file *fuse_filp = iocb_fuse->ki_filp;
> > > +       struct fuse_file *ff = fuse_filp->private_data;
> > > +       struct inode *fuse_inode = file_inode(fuse_filp);
> > > +       struct file *passthrough_filp = ff->passthrough_filp;
> > > +
> > > +       if (!iov_iter_count(iter))
> > > +               return 0;
> > > +
> > > +       inode_lock(fuse_inode);
> > > +
> > > +       if (is_sync_kiocb(iocb_fuse)) {
> > > +               struct kiocb iocb;
> > > +
> > > +               kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
> > > +
> > > +               file_start_write(passthrough_filp);
> > > +               ret = call_write_iter(passthrough_filp, &iocb, iter);
> >
> > Why not vfs_iter_write()/vfs_iter_read()?
> >
> > You are bypassing many internal VFS checks that seem pretty important.
> >
>
> I've been thinking a lot about this and decided to go for the VFS bypassing
> solution because:
> 1. it looked odd to me to perform VFS checks twice, both for FUSE and lower
>    FS and it seemed to me that we found a tradeoff with Jann about doing
>    this lower FS call, and
> 2. in our Android use case (I just saw you asking for more details about
>    this in, I'll reply on the other thread), the user might have the right
>    credentials to access the FUSE file system, but not to access the lower
>    file system, so the VFS checkings would fail. So that would have created
>    the need for a credential bypassing that looked hacky.
>
> But I agree and I would probably sleep better knowing that VFS checks are
> not skipped :) So I decided to implemented the vfs_iter_{read,write}()
> variant.
>
> I again picked a lot from the overlayfs solution. In a few words, I get the
> FUSE daemon credential reference at FUSE FS creation time and, when
> passthrough read/write operations are triggered, the kernel temporarily
> overrides the requesting process' credentials with those of the FUSE
> daemon. Credentials are reverted as soon as the operation completes.
>
> I have a temporary development branch where I'm developing the V9 of this
> patch, plus the VFS variant (git history may change):
>
> https://github.com/balsini/linux/tree/fuse-passthrough-stable-v5.8-v9-vfs
>
> For now, I'm happy to say that I like this VFS solution, it also simplified
> the lower file system notifications, so I'll probably go with this in the
> V9.
>

I am happy this direction was workable, not because the overlayfs solution
is perfect, but because it already has decent mileage running into strange
corner cases and fixing them.

But I also think that when the fuse driver performs actions on behalf of
the server, it is better for it to use the server's credentials, because
this way the passthrough fd code path behaves logically closer to the
non-passthrough code, only (hopefully) faster.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6c0ec742ce74..c3289ff0cd33 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1552,7 +1552,9 @@  static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (is_bad_inode(file_inode(file)))
 		return -EIO;
 
-	if (!(ff->open_flags & FOPEN_DIRECT_IO))
+	if (ff->passthrough_filp)
+		return fuse_passthrough_read_iter(iocb, to);
+	else if (!(ff->open_flags & FOPEN_DIRECT_IO))
 		return fuse_cache_read_iter(iocb, to);
 	else
 		return fuse_direct_read_iter(iocb, to);
@@ -1566,7 +1568,9 @@  static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (is_bad_inode(file_inode(file)))
 		return -EIO;
 
-	if (!(ff->open_flags & FOPEN_DIRECT_IO))
+	if (ff->passthrough_filp)
+		return fuse_passthrough_write_iter(iocb, from);
+	else if (!(ff->open_flags & FOPEN_DIRECT_IO))
 		return fuse_cache_write_iter(iocb, from);
 	else
 		return fuse_direct_write_iter(iocb, from);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6c5166447905..21ba30a6a661 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1106,5 +1106,7 @@  void fuse_free_conn(struct fuse_conn *fc);
 
 int fuse_passthrough_setup(struct fuse_req *req, unsigned int fd);
 void fuse_passthrough_release(struct fuse_file *ff);
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *from);
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 86ab4eafa7bf..44a78e02f45d 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -2,6 +2,87 @@ 
 
 #include "fuse_i.h"
 
+#include <linux/fs_stack.h>
+#include <linux/fsnotify.h>
+#include <linux/uio.h>
+
+static void fuse_copyattr(struct file *dst_file, struct file *src_file,
+			  bool write)
+{
+	if (write) {
+		struct inode *dst = file_inode(dst_file);
+		struct inode *src = file_inode(src_file);
+
+		fsnotify_modify(src_file);
+		fsstack_copy_inode_size(dst, src);
+	} else {
+		fsnotify_access(src_file);
+	}
+}
+
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb_fuse,
+				   struct iov_iter *iter)
+{
+	ssize_t ret;
+	struct file *fuse_filp = iocb_fuse->ki_filp;
+	struct fuse_file *ff = fuse_filp->private_data;
+	struct file *passthrough_filp = ff->passthrough_filp;
+
+	if (!iov_iter_count(iter))
+		return 0;
+
+	if (is_sync_kiocb(iocb_fuse)) {
+		struct kiocb iocb;
+
+		kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
+		ret = call_read_iter(passthrough_filp, &iocb, iter);
+		iocb_fuse->ki_pos = iocb.ki_pos;
+		if (ret >= 0)
+			fuse_copyattr(fuse_filp, passthrough_filp, false);
+
+	} else {
+		ret = -EIO;
+	}
+
+	return ret;
+}
+
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb_fuse,
+				    struct iov_iter *iter)
+{
+	ssize_t ret;
+	struct file *fuse_filp = iocb_fuse->ki_filp;
+	struct fuse_file *ff = fuse_filp->private_data;
+	struct inode *fuse_inode = file_inode(fuse_filp);
+	struct file *passthrough_filp = ff->passthrough_filp;
+
+	if (!iov_iter_count(iter))
+		return 0;
+
+	inode_lock(fuse_inode);
+
+	if (is_sync_kiocb(iocb_fuse)) {
+		struct kiocb iocb;
+
+		kiocb_clone(&iocb, iocb_fuse, passthrough_filp);
+
+		file_start_write(passthrough_filp);
+		ret = call_write_iter(passthrough_filp, &iocb, iter);
+		file_end_write(passthrough_filp);
+
+		iocb_fuse->ki_pos = iocb.ki_pos;
+		if (ret > 0)
+			fuse_copyattr(fuse_filp, passthrough_filp, true);
+	} else {
+		ret = -EIO;
+	}
+
+	inode_unlock(fuse_inode);
+
+	return ret;
+}
+
 int fuse_passthrough_setup(struct fuse_req *req, unsigned int fd)
 {
 	int ret;